一、加载数据¶
加载数据(train+test)¶
In [ ]:
# --- Environment setup: imports, warning suppression, display and font options ---
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")

# Show every column and row when printing DataFrames (the dataset has 50+ columns).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import seaborn as sns

# Fix garbled Chinese characters in matplotlib plots.
import matplotlib
import matplotlib.font_manager as font_manager
# Provide fallback CJK fonts: the original single entry 'FZSongYi-Z13S' is not
# installed on every machine and floods the output with "findfont" warnings;
# matplotlib walks this list until it finds an available family.
matplotlib.rcParams['font.sans-serif'] = ['FZSongYi-Z13S', 'SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei', 'Arial Unicode MS']
# Render the minus sign correctly when a CJK font is active.
matplotlib.rcParams['axes.unicode_minus'] = False
# Load the training and test sets (tab-separated text files).
# NOTE(review): paths are relative to the notebook's working directory.
train_data = pd.read_csv('./userlostprob_train.txt', sep='\t')
test_data = pd.read_csv('./userlostprob_test.txt', sep='\t')
# Peek at the first rows of each set to inspect the raw structure.
print(train_data.head())
print(test_data.head())
label sampleid d arrival iforderpv_24h decisionhabit_user \ 0 0 24636 2016-05-18 2016-05-18 0 NaN 1 1 24637 2016-05-18 2016-05-18 0 NaN 2 0 24641 2016-05-18 2016-05-19 0 NaN 3 0 24642 2016-05-18 2016-05-18 0 NaN 4 1 24644 2016-05-18 2016-05-19 0 NaN historyvisit_7ordernum historyvisit_totalordernum hotelcr \ 0 NaN NaN 1.04 1 NaN NaN 1.06 2 NaN NaN 1.05 3 NaN NaN 1.01 4 NaN NaN 1.00 ordercanceledprecent landhalfhours ordercanncelednum commentnums \ 0 NaN 22.0 NaN 1089.0 1 NaN 0.0 NaN 5612.0 2 NaN 3.0 NaN 256.0 3 NaN 2.0 NaN NaN 4 NaN 0.0 NaN NaN starprefer novoters consuming_capacity historyvisit_avghotelnum \ 0 NaN 1933.0 NaN NaN 1 NaN 6852.0 NaN NaN 2 NaN 367.0 NaN NaN 3 NaN NaN NaN NaN 4 NaN NaN NaN NaN cancelrate historyvisit_visit_detailpagenum delta_price1 \ 0 1261.0 NaN NaN 1 3205.0 NaN NaN 2 194.0 NaN NaN 3 3.0 NaN NaN 4 NaN NaN NaN price_sensitive hoteluv businessrate_pre ordernum_oneyear cr_pre \ 0 NaN 102.607 0.25 NaN 1.03 1 NaN 278.373 0.51 NaN 1.07 2 NaN 16.133 0.61 NaN 1.12 3 NaN 1.780 NaN NaN 1.01 4 NaN 0.073 NaN NaN 1.03 avgprice lowestprice firstorder_bu customereval_pre2 delta_price2 \ 0 NaN 49.0 NaN 3.2 NaN 1 NaN 619.0 NaN 4.9 NaN 2 NaN 312.0 NaN 3.9 NaN 3 NaN 198.0 NaN 2.1 NaN 4 NaN NaN NaN 1.5 NaN commentnums_pre customer_value_profit commentnums_pre2 cancelrate_pre \ 0 724.0 NaN 844.0 0.03 1 5610.0 NaN 3789.0 0.21 2 4721.0 NaN 4341.0 0.52 3 41.0 NaN 529.0 0.53 4 NaN NaN NaN 1.00 novoters_pre2 novoters_pre ctrip_profits deltaprice_pre2_t1 \ 0 1335.0 1249.0 NaN 29.0 1 5430.0 7829.0 NaN -56.0 2 5353.0 7324.0 NaN 8.0 3 1004.0 81.0 NaN -7.0 4 1.0 NaN NaN -5.0 lowestprice_pre uv_pre uv_pre2 lowestprice_pre2 lasthtlordergap \ 0 46.0 58.027 74.956 615.0 NaN 1 111.0 249.347 224.920 513.0 NaN 2 413.0 133.093 112.063 382.0 NaN 3 188.0 4.600 58.844 203.0 NaN 4 NaN 0.213 0.157 84.0 NaN businessrate_pre2 cityuvs cityorders lastpvgap cr sid \ 0 0.29 12.880 3.147 NaN NaN 7 1 0.53 17.933 4.913 NaN NaN 33 2 0.60 3.993 0.760 NaN NaN 10 3 0.18 3.220 0.660 NaN NaN 
8 4 NaN 0.013 NaN NaN NaN 1 visitnum_oneyear h 0 NaN 12 1 NaN 14 2 NaN 19 3 NaN 16 4 NaN 21 sampleid d arrival iforderpv_24h decisionhabit_user \ 0 2 2016-05-22 2016-05-23 0 4.0 1 7 2016-05-22 2016-06-15 0 7.0 2 14 2016-05-22 2016-05-22 0 NaN 3 19 2016-05-22 2016-05-22 0 1.0 4 20 2016-05-22 2016-05-22 0 4.0 historyvisit_7ordernum historyvisit_totalordernum hotelcr \ 0 NaN 4.0 1.03 1 NaN NaN 1.03 2 NaN NaN 1.02 3 NaN 5.0 1.01 4 NaN 9.0 1.06 ordercanceledprecent landhalfhours ordercanncelednum commentnums \ 0 0.00 0.0 0.0 3866.0 1 NaN 1.0 NaN 1377.0 2 NaN 0.0 NaN 11846.0 3 0.20 21.0 1.0 242.0 4 0.27 2.0 7.0 453.0 starprefer novoters consuming_capacity historyvisit_avghotelnum \ 0 96.7 5137.0 63.0 3.3 1 NaN 1754.0 NaN 7.0 2 NaN 14931.0 NaN NaN 3 70.0 329.0 40.0 1.0 4 40.0 602.0 9.0 5.2 cancelrate historyvisit_visit_detailpagenum delta_price1 \ 0 2191.0 7.0 167.0 1 1284.0 13.0 NaN 2 6110.0 NaN NaN 3 378.0 2.0 157.0 4 174.0 19.0 17.0 price_sensitive hoteluv businessrate_pre ordernum_oneyear cr_pre \ 0 9.0 300.747 0.52 4.0 1.05 1 NaN 243.720 0.67 NaN 1.06 2 NaN 1547.253 0.04 NaN 1.01 3 24.0 139.827 NaN 5.0 1.03 4 2.0 12.940 0.01 9.0 1.06 avgprice lowestprice firstorder_bu customereval_pre2 delta_price2 \ 0 635.0 723.0 NaN 5.0 79.0 1 NaN 889.0 NaN 5.0 NaN 2 NaN 722.0 NaN 4.8 NaN 3 359.0 334.0 NaN 2.9 94.0 4 97.0 118.0 13.0 2.0 17.0 commentnums_pre customer_value_profit commentnums_pre2 cancelrate_pre \ 0 1161.0 3.230 1352.0 0.18 1 1940.0 NaN 2767.0 0.29 2 2089.0 NaN 5992.0 0.12 3 2.0 2.466 220.0 0.36 4 172.0 -0.016 102.0 0.21 novoters_pre2 novoters_pre ctrip_profits deltaprice_pre2_t1 \ 0 2146.0 1612.0 3.227 -13.0 1 4087.0 2689.0 2.853 NaN 2 6650.0 3263.0 NaN 0.0 3 324.0 10.0 2.460 -14.0 4 170.0 297.0 NaN 7.0 lowestprice_pre uv_pre uv_pre2 lowestprice_pre2 lasthtlordergap \ 0 468.0 51.593 197.800 556.0 149965.0 1 1090.0 216.500 168.276 833.0 NaN 2 623.0 1081.507 1136.691 640.0 NaN 3 202.0 2.200 59.645 265.0 116831.0 4 80.0 3.827 2.993 79.0 3554.0 businessrate_pre2 cityuvs 
cityorders lastpvgap cr sid \ 0 0.31 1.773 0.153 204.0 1.00 46 1 0.60 0.993 0.093 7364.0 1.00 77 2 0.03 13.067 2.227 NaN NaN 54 3 0.05 3.247 0.173 4689.0 1.00 56 4 0.04 8.747 1.960 2026.0 1.89 75 visitnum_oneyear h 0 1545.0 22 1 1084.0 21 2 NaN 1 3 336.0 9 4 1416.0 16
In [ ]:
def isSame(train_data, test_data):
    """Report whether two DataFrames have exactly the same columns.

    Prints a summary (strings kept in Chinese, matching the notebook's other
    output) and, when they differ, lists the columns unique to each side.
    Also returns True when the column sets match and False otherwise, so the
    result can be used programmatically (the original returned None).
    """
    # Columns present in one frame but missing from the other.
    unique_to_train = train_data.columns.difference(test_data.columns)
    unique_to_test = test_data.columns.difference(train_data.columns)
    same = unique_to_train.empty and unique_to_test.empty
    if same:
        print("字段完全一样。")
    else:
        print("字段不完全一样。")
        print(f"在train_data中独有的字段: {unique_to_train}")
        print(f"在test_data中独有的字段: {unique_to_test}")
    return same
# Compare the column sets of the raw train/test frames.
isSame(train_data,test_data)
字段不完全一样。 在train_data中独有的字段: Index(['label'], dtype='object') 在test_data中独有的字段: Index([], dtype='object')
查询数据信息¶
基本信息(字段+非Null+类型)¶
In [ ]:
# Column / Non-Null Count / Dtype overview for both frames.
# Inspect dtypes and per-column non-null counts of the training set.
train_data.info()
# Same overview for the test set.
test_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 689945 entries, 0 to 689944 Data columns (total 51 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 label 689945 non-null int64 1 sampleid 689945 non-null int64 2 d 689945 non-null object 3 arrival 689945 non-null object 4 iforderpv_24h 689945 non-null int64 5 decisionhabit_user 385450 non-null float64 6 historyvisit_7ordernum 82915 non-null float64 7 historyvisit_totalordernum 386525 non-null float64 8 hotelcr 689148 non-null float64 9 ordercanceledprecent 447831 non-null float64 10 landhalfhours 661312 non-null float64 11 ordercanncelednum 447831 non-null float64 12 commentnums 622029 non-null float64 13 starprefer 464892 non-null float64 14 novoters 672918 non-null float64 15 consuming_capacity 463837 non-null float64 16 historyvisit_avghotelnum 387876 non-null float64 17 cancelrate 678227 non-null float64 18 historyvisit_visit_detailpagenum 307234 non-null float64 19 delta_price1 437146 non-null float64 20 price_sensitive 463837 non-null float64 21 hoteluv 689148 non-null float64 22 businessrate_pre 483896 non-null float64 23 ordernum_oneyear 447831 non-null float64 24 cr_pre 660548 non-null float64 25 avgprice 457261 non-null float64 26 lowestprice 687931 non-null float64 27 firstorder_bu 376993 non-null float64 28 customereval_pre2 661312 non-null float64 29 delta_price2 437750 non-null float64 30 commentnums_pre 598368 non-null float64 31 customer_value_profit 439123 non-null float64 32 commentnums_pre2 648457 non-null float64 33 cancelrate_pre 653015 non-null float64 34 novoters_pre2 657616 non-null float64 35 novoters_pre 648956 non-null float64 36 ctrip_profits 445187 non-null float64 37 deltaprice_pre2_t1 543180 non-null float64 38 lowestprice_pre 659689 non-null float64 39 uv_pre 660548 non-null float64 40 uv_pre2 661189 non-null float64 41 lowestprice_pre2 660664 non-null float64 42 lasthtlordergap 447831 non-null float64 43 businessrate_pre2 602960 non-null float64 44 cityuvs 
682274 non-null float64 45 cityorders 651263 non-null float64 46 lastpvgap 592818 non-null float64 47 cr 457896 non-null float64 48 sid 689945 non-null int64 49 visitnum_oneyear 592910 non-null float64 50 h 689945 non-null int64 dtypes: float64(44), int64(5), object(2) memory usage: 268.5+ MB <class 'pandas.core.frame.DataFrame'> RangeIndex: 435075 entries, 0 to 435074 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sampleid 435075 non-null int64 1 d 435075 non-null object 2 arrival 435075 non-null object 3 iforderpv_24h 435075 non-null int64 4 decisionhabit_user 280899 non-null float64 5 historyvisit_7ordernum 38543 non-null float64 6 historyvisit_totalordernum 270733 non-null float64 7 hotelcr 433263 non-null float64 8 ordercanceledprecent 136708 non-null float64 9 landhalfhours 416027 non-null float64 10 ordercanncelednum 136708 non-null float64 11 commentnums 389702 non-null float64 12 starprefer 140768 non-null float64 13 novoters 424681 non-null float64 14 consuming_capacity 140516 non-null float64 15 historyvisit_avghotelnum 280899 non-null float64 16 cancelrate 426018 non-null float64 17 historyvisit_visit_detailpagenum 185397 non-null float64 18 delta_price1 131464 non-null float64 19 price_sensitive 140534 non-null float64 20 hoteluv 433263 non-null float64 21 businessrate_pre 301030 non-null float64 22 ordernum_oneyear 136708 non-null float64 23 cr_pre 414281 non-null float64 24 avgprice 138804 non-null float64 25 lowestprice 433705 non-null float64 26 firstorder_bu 107166 non-null float64 27 customereval_pre2 416027 non-null float64 28 delta_price2 131612 non-null float64 29 commentnums_pre 374104 non-null float64 30 customer_value_profit 134568 non-null float64 31 commentnums_pre2 407392 non-null float64 32 cancelrate_pre 409237 non-null float64 33 novoters_pre2 413810 non-null float64 34 novoters_pre 408418 non-null float64 35 ctrip_profits 134329 non-null float64 36 deltaprice_pre2_t1 327667 non-null 
float64 37 lowestprice_pre 415073 non-null float64 38 uv_pre 414281 non-null float64 39 uv_pre2 415713 non-null float64 40 lowestprice_pre2 415560 non-null float64 41 lasthtlordergap 136708 non-null float64 42 businessrate_pre2 378710 non-null float64 43 cityuvs 429798 non-null float64 44 cityorders 359043 non-null float64 45 lastpvgap 167631 non-null float64 46 cr 314164 non-null float64 47 sid 435075 non-null int64 48 visitnum_oneyear 167666 non-null float64 49 h 435075 non-null int64 dtypes: float64(44), int64(4), object(2) memory usage: 166.0+ MB
label分布(1+0)¶
In [ ]:
# Distribution of the target label (1 is treated as churn later in the notebook).
train_data.label.value_counts()
Out[ ]:
label 0 500588 1 189357 Name: count, dtype: int64
查看数据形状(行+字段)¶
In [ ]:
# Shape check: train has one extra column ('label') compared to test.
train_data.shape # (689945, 51)
# test_data.shape # (435075, 50)
Out[ ]:
(689945, 51)
客户流失比率¶
In [ ]:
# Churn ratio: share of rows with label == 1 among all labelled rows.
label_counts = train_data['label'].value_counts()
churn_ratio = label_counts[1] / label_counts.sum()
print('客户流失比率:{0:.2%}'.format(churn_ratio))
客户流失比率:27.45%
二、清洗数据¶
copy 原始数据(便于后续增删改查)¶
In [ ]:
# Work on copies so the raw frames stay untouched for later reference.
train_data_rawdf = train_data.copy()
test_data_rawdf = test_data.copy()
In [ ]:
def drop_date(rawdf):
    """Replace the raw date columns with derived booking features, in place.

    Adds three columns and drops 'd' (visit date) and 'arrival':
      - day_advanced:       days between visit and arrival (booking lead time)
      - arrival_weekday:    weekday of arrival (0=Monday .. 6=Sunday)
      - is_arrival_weekend: 1 when arrival falls on Saturday/Sunday, else 0

    Safe to call more than once: returns immediately if already applied.
    """
    if 'day_advanced' in rawdf.columns:
        return  # already processed
    # Parse each column exactly once (the original converted them twice).
    rawdf['arrival'] = pd.to_datetime(rawdf['arrival'], format='%Y-%m-%d')
    rawdf['d'] = pd.to_datetime(rawdf['d'], format='%Y-%m-%d')
    # Booking lead time: arrival date minus visit date, in whole days.
    rawdf['day_advanced'] = (rawdf['arrival'] - rawdf['d']).dt.days
    # Weekday of check-in, vectorized (same values as mapping x.weekday()).
    rawdf['arrival_weekday'] = rawdf['arrival'].dt.weekday.astype(int)
    # 1 = weekend (Saturday=5 / Sunday=6), 0 = workday.
    rawdf['is_arrival_weekend'] = (rawdf['arrival_weekday'] >= 5).astype(int)
    rawdf.drop(labels=['d', 'arrival'], axis=1, inplace=True)
# Apply the date-feature derivation to both working copies.
drop_date(train_data_rawdf)
drop_date(test_data_rawdf)
In [ ]:
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。 在train_data中独有的字段: Index(['label'], dtype='object') 在test_data中独有的字段: Index([], dtype='object')
查看均值方差(describe)¶
In [ ]:
# Transposed summary statistics (count/mean/std/quartiles) per column.
desc_stats = train_data_rawdf.describe().T
# Add skewness and kurtosis per column.
desc_stats['skew'] = train_data_rawdf.skew()
desc_stats['kurt'] = train_data_rawdf.kurt()
# test_data_rawdf['skew'] = test_data_rawdf.skew()
# test_data_rawdf['kurt'] = test_data_rawdf.kurt()
# test_data_rawdf.describe().T
desc_stats.head(10)
# Observations:
# count:
#   Columns are missing to varying degrees, so counts are not all 689945;
#   e.g. historyvisit_7ordernum has only 82915 rows. Keep each column's
#   distribution shape in mind when imputing missing values later.
# Negative values:
#   Some columns that should be non-negative contain negatives, e.g.
#   delta_price1 (preferred price minus the price of the hotel browsed most
#   in 24h), lowestprice, delta_price2, customer_value_profit (customer
#   value over the past year); these anomalies need handling later.
# Variance:
#   Several columns contain extreme values and have very large variance;
#   those extremes need treatment.
Out[ ]:
| count | mean | std | min | 25% | 50% | 75% | max | skew | kurt | |
|---|---|---|---|---|---|---|---|---|---|---|
| label | 689945.0 | 0.274452 | 0.446238 | 0.0 | 0.00 | 0.00 | 1.00 | 1.00 | 1.010888 | -0.978109 |
| sampleid | 689945.0 | 628540.209625 | 414681.498697 | 24636.0 | 312320.00 | 599637.00 | 887460.00 | 2238426.00 | 1.342328 | 3.576609 |
| iforderpv_24h | 689945.0 | 0.193737 | 0.395226 | 0.0 | 0.00 | 0.00 | 0.00 | 1.00 | 1.549817 | 0.401934 |
| decisionhabit_user | 385450.0 | 5.317048 | 38.524483 | 0.0 | 2.00 | 3.00 | 5.00 | 3167.00 | 50.595261 | 2685.315258 |
| historyvisit_7ordernum | 82915.0 | 1.856094 | 2.103862 | 1.0 | 1.00 | 1.00 | 2.00 | 106.00 | 28.261263 | 1347.099159 |
| historyvisit_totalordernum | 386525.0 | 11.710487 | 17.251429 | 1.0 | 2.00 | 6.00 | 14.00 | 711.00 | 4.656652 | 47.507033 |
| hotelcr | 689148.0 | 1.060996 | 0.045264 | 1.0 | 1.03 | 1.05 | 1.09 | 3.18 | 2.830497 | 68.146802 |
| ordercanceledprecent | 447831.0 | 0.342119 | 0.354210 | 0.0 | 0.00 | 0.25 | 0.57 | 1.00 | 0.655980 | -0.929859 |
| landhalfhours | 661312.0 | 6.086366 | 12.413225 | 0.0 | 0.00 | 0.00 | 4.00 | 49.00 | 2.178247 | 3.569327 |
| ordercanncelednum | 447831.0 | 154.179369 | 398.456986 | 0.0 | 0.00 | 2.00 | 153.00 | 13475.00 | 7.527943 | 127.526808 |
查看数据缺失率(bar)¶
In [ ]:
# Fraction of missing values per column in the training copy.
train_data_rawdf.isnull().mean()
Out[ ]:
label 0.000000 sampleid 0.000000 iforderpv_24h 0.000000 decisionhabit_user 0.441332 historyvisit_7ordernum 0.879824 historyvisit_totalordernum 0.439774 hotelcr 0.001155 ordercanceledprecent 0.350918 landhalfhours 0.041500 ordercanncelednum 0.350918 commentnums 0.098437 starprefer 0.326190 novoters 0.024679 consuming_capacity 0.327719 historyvisit_avghotelnum 0.437816 cancelrate 0.016984 historyvisit_visit_detailpagenum 0.554698 delta_price1 0.366405 price_sensitive 0.327719 hoteluv 0.001155 businessrate_pre 0.298646 ordernum_oneyear 0.350918 cr_pre 0.042608 avgprice 0.337250 lowestprice 0.002919 firstorder_bu 0.453590 customereval_pre2 0.041500 delta_price2 0.365529 commentnums_pre 0.132731 customer_value_profit 0.363539 commentnums_pre2 0.060132 cancelrate_pre 0.053526 novoters_pre2 0.046857 novoters_pre 0.059409 ctrip_profits 0.354750 deltaprice_pre2_t1 0.212720 lowestprice_pre 0.043853 uv_pre 0.042608 uv_pre2 0.041679 lowestprice_pre2 0.042440 lasthtlordergap 0.350918 businessrate_pre2 0.126075 cityuvs 0.011118 cityorders 0.056065 lastpvgap 0.140775 cr 0.336330 sid 0.000000 visitnum_oneyear 0.140642 h 0.000000 day_advanced 0.000000 arrival_weekday 0.000000 is_arrival_weekend 0.000000 dtype: float64
查看缺失率(train)¶
In [ ]:
# Plot per-column missing-value ratios, highest first.
train_data_rawdf.isnull().mean().sort_values(ascending=False).plot(kind='bar', figsize=(20,10))
# Missingness:
#   Many columns are heavily missing; historyvisit_7ordernum is ~88% missing.
# Imputation:
#   Apart from arrival, d, h, sampleid, iforderpv_24h, sid and label, the
#   remaining 44 columns have varying degrees of missingness, so a suitable
#   imputation method will be chosen per column based on its distribution.
Out[ ]:
<Axes: >
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following 
families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not 
found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S 
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following 
families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not 
found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S 
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following 
families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not 
found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S 
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following 
families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not 
found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S 
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S findfont: Generic family 'sans-serif' not found because none of the following 
families were found: FZSongYi-Z13S
查看缺失率(test)¶
In [ ]:
#查看缺失值比例
# train_data.isnull().mean().sort_values(ascending=False).plot(kind='bar', figsize=(20,10))
# 缺失:
# 看出字段缺失情况严重,其中historyvisit_7ordernum缺失值高达88%。除了arrival,d,h,sampleid,iforderpv_24h,sid,label外,
# 填充:
# 其余44列字段各有不同程度缺失。因此后面要根据缺失情况,结合数据特征分布,选用合适的方法填充缺失值。
def get_na_ratio(data):
    """Plot the per-column missing-value ratio of *data* as a horizontal bar chart.

    Bars are sorted ascending so the columns with the most missing data end
    up at the top of the chart; each bar is annotated with its exact ratio.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected for NaN ratios.

    Side effect: draws and shows a matplotlib figure; returns nothing.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
    data_count = data.count()                     # non-NaN count per column
    na_rate = (len(data) - data_count) / len(data)
    # ascending sort so the worst columns appear last (top of barh chart)
    ratios = pd.DataFrame(na_rate.sort_values(ascending=True))
    n_cols = data.shape[1]
    plt.figure(figsize=(8, 12))
    plt.barh(range(n_cols), ratios[0], color='steelblue', alpha=1)
    plt.xlabel('数据缺失占比')  # axis label: missing-data ratio
    plt.yticks(range(n_cols), ratios.index.values.tolist())
    plt.xlim([0, 1])  # ratios are bounded to [0, 1]
    # annotate each bar with its exact ratio (original reused `x` here,
    # shadowing the column count above — renamed for clarity)
    for pos, ratio in enumerate(ratios[0]):
        plt.text(ratio, pos, '%.3f' % ratio, va='bottom')
    plt.show()
get_na_ratio(test_data_rawdf)
查看数据分布偏态情况(skew)¶
In [ ]:
# Preview the engineered training frame; the skew() call was kept commented
# out after an earlier inspection.
# NOTE(review): `train_data_rawdf` is created in a cell outside this view —
# presumably derived from `train_data`; confirm upstream.
# train_data_rawdf.skew().sort_values()
train_data_rawdf.head()
Out[ ]:
| label | sampleid | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | landhalfhours | ordercanncelednum | commentnums | starprefer | novoters | consuming_capacity | historyvisit_avghotelnum | cancelrate | historyvisit_visit_detailpagenum | delta_price1 | price_sensitive | hoteluv | businessrate_pre | ordernum_oneyear | cr_pre | avgprice | lowestprice | firstorder_bu | customereval_pre2 | delta_price2 | commentnums_pre | customer_value_profit | commentnums_pre2 | cancelrate_pre | novoters_pre2 | novoters_pre | ctrip_profits | deltaprice_pre2_t1 | lowestprice_pre | uv_pre | uv_pre2 | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | day_advanced | arrival_weekday | is_arrival_weekend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 24636 | 0 | NaN | NaN | NaN | 1.04 | NaN | 22.0 | NaN | 1089.0 | NaN | 1933.0 | NaN | NaN | 1261.0 | NaN | NaN | NaN | 102.607 | 0.25 | NaN | 1.03 | NaN | 49.0 | NaN | 3.2 | NaN | 724.0 | NaN | 844.0 | 0.03 | 1335.0 | 1249.0 | NaN | 29.0 | 46.0 | 58.027 | 74.956 | 615.0 | NaN | 0.29 | 12.880 | 3.147 | NaN | NaN | 7 | NaN | 12 | 0 | 2 | 0 |
| 1 | 1 | 24637 | 0 | NaN | NaN | NaN | 1.06 | NaN | 0.0 | NaN | 5612.0 | NaN | 6852.0 | NaN | NaN | 3205.0 | NaN | NaN | NaN | 278.373 | 0.51 | NaN | 1.07 | NaN | 619.0 | NaN | 4.9 | NaN | 5610.0 | NaN | 3789.0 | 0.21 | 5430.0 | 7829.0 | NaN | -56.0 | 111.0 | 249.347 | 224.920 | 513.0 | NaN | 0.53 | 17.933 | 4.913 | NaN | NaN | 33 | NaN | 14 | 0 | 2 | 0 |
| 2 | 0 | 24641 | 0 | NaN | NaN | NaN | 1.05 | NaN | 3.0 | NaN | 256.0 | NaN | 367.0 | NaN | NaN | 194.0 | NaN | NaN | NaN | 16.133 | 0.61 | NaN | 1.12 | NaN | 312.0 | NaN | 3.9 | NaN | 4721.0 | NaN | 4341.0 | 0.52 | 5353.0 | 7324.0 | NaN | 8.0 | 413.0 | 133.093 | 112.063 | 382.0 | NaN | 0.60 | 3.993 | 0.760 | NaN | NaN | 10 | NaN | 19 | 1 | 3 | 0 |
| 3 | 0 | 24642 | 0 | NaN | NaN | NaN | 1.01 | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN | NaN | 1.780 | NaN | NaN | 1.01 | NaN | 198.0 | NaN | 2.1 | NaN | 41.0 | NaN | 529.0 | 0.53 | 1004.0 | 81.0 | NaN | -7.0 | 188.0 | 4.600 | 58.844 | 203.0 | NaN | 0.18 | 3.220 | 0.660 | NaN | NaN | 8 | NaN | 16 | 0 | 2 | 0 |
| 4 | 1 | 24644 | 0 | NaN | NaN | NaN | 1.00 | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.073 | NaN | NaN | 1.03 | NaN | NaN | NaN | 1.5 | NaN | NaN | NaN | NaN | 1.00 | 1.0 | NaN | NaN | -5.0 | NaN | 0.213 | 0.157 | 84.0 | NaN | NaN | 0.013 | NaN | NaN | NaN | 1 | NaN | 21 | 1 | 3 | 0 |
数据分布偏态情况¶
In [ ]:
train_data_rawdf.skew().sort_values()
# Skewness interpretation guide:
# = 0: the distribution is symmetric.
# > 1 or < -1: severely skewed.
# 0.5..1 or -1..-0.5: moderately skewed.
# -0.5..0 or 0..0.5: mildly skewed.
# Except for businessrate_pre2, businessrate_pre and customereval_pre2,
# most columns here show strong skew.
# Only this last expression's result is echoed by the notebook cell:
test_data_rawdf.skew().sort_values()
Out[ ]:
delta_price1 -64.101019 delta_price2 -18.164185 deltaprice_pre2_t1 -3.027764 firstorder_bu -1.607174 sampleid -1.446678 h -0.665304 starprefer -0.404704 arrival_weekday -0.186010 businessrate_pre2 -0.071087 customereval_pre2 -0.041711 businessrate_pre -0.011670 ordercanceledprecent 0.649939 consuming_capacity 0.931010 is_arrival_weekend 0.965172 cancelrate_pre 1.274742 price_sensitive 1.523604 iforderpv_24h 1.667979 lasthtlordergap 1.778979 hotelcr 1.785177 day_advanced 1.821458 cr_pre 1.913542 landhalfhours 2.109167 cityuvs 2.241206 cityorders 2.413386 historyvisit_7ordernum 2.688576 avgprice 3.060426 ordernum_oneyear 3.475545 cancelrate 3.801247 uv_pre2 3.959311 uv_pre 4.228314 historyvisit_totalordernum 4.235041 hoteluv 4.445247 commentnums_pre2 4.603914 novoters_pre2 4.632928 lastpvgap 5.034694 ordercanncelednum 5.127708 novoters_pre 5.332249 commentnums_pre 5.352293 novoters 5.499180 commentnums 5.570767 ctrip_profits 6.034621 customer_value_profit 6.391364 sid 7.280452 decisionhabit_user 7.387129 historyvisit_avghotelnum 7.404542 historyvisit_visit_detailpagenum 9.843808 visitnum_oneyear 17.849537 lowestprice_pre2 20.223499 cr 24.179435 lowestprice_pre 56.521439 lowestprice 62.703681 dtype: float64
In [ ]:
# 查看数据分布图
train_data_rawdf.hist(figsize=(20,20))
Out[ ]:
array([[<Axes: title={'center': 'label'}>,
<Axes: title={'center': 'sampleid'}>,
<Axes: title={'center': 'iforderpv_24h'}>,
<Axes: title={'center': 'decisionhabit_user'}>,
<Axes: title={'center': 'historyvisit_7ordernum'}>,
<Axes: title={'center': 'historyvisit_totalordernum'}>,
<Axes: title={'center': 'hotelcr'}>],
[<Axes: title={'center': 'ordercanceledprecent'}>,
<Axes: title={'center': 'landhalfhours'}>,
<Axes: title={'center': 'ordercanncelednum'}>,
<Axes: title={'center': 'commentnums'}>,
<Axes: title={'center': 'starprefer'}>,
<Axes: title={'center': 'novoters'}>,
<Axes: title={'center': 'consuming_capacity'}>],
[<Axes: title={'center': 'historyvisit_avghotelnum'}>,
<Axes: title={'center': 'cancelrate'}>,
<Axes: title={'center': 'historyvisit_visit_detailpagenum'}>,
<Axes: title={'center': 'delta_price1'}>,
<Axes: title={'center': 'price_sensitive'}>,
<Axes: title={'center': 'hoteluv'}>,
<Axes: title={'center': 'businessrate_pre'}>],
[<Axes: title={'center': 'ordernum_oneyear'}>,
<Axes: title={'center': 'cr_pre'}>,
<Axes: title={'center': 'avgprice'}>,
<Axes: title={'center': 'lowestprice'}>,
<Axes: title={'center': 'firstorder_bu'}>,
<Axes: title={'center': 'customereval_pre2'}>,
<Axes: title={'center': 'delta_price2'}>],
[<Axes: title={'center': 'commentnums_pre'}>,
<Axes: title={'center': 'customer_value_profit'}>,
<Axes: title={'center': 'commentnums_pre2'}>,
<Axes: title={'center': 'cancelrate_pre'}>,
<Axes: title={'center': 'novoters_pre2'}>,
<Axes: title={'center': 'novoters_pre'}>,
<Axes: title={'center': 'ctrip_profits'}>],
[<Axes: title={'center': 'deltaprice_pre2_t1'}>,
<Axes: title={'center': 'lowestprice_pre'}>,
<Axes: title={'center': 'uv_pre'}>,
<Axes: title={'center': 'uv_pre2'}>,
<Axes: title={'center': 'lowestprice_pre2'}>,
<Axes: title={'center': 'lasthtlordergap'}>,
<Axes: title={'center': 'businessrate_pre2'}>],
[<Axes: title={'center': 'cityuvs'}>,
<Axes: title={'center': 'cityorders'}>,
<Axes: title={'center': 'lastpvgap'}>,
<Axes: title={'center': 'cr'}>, <Axes: title={'center': 'sid'}>,
<Axes: title={'center': 'visitnum_oneyear'}>,
<Axes: title={'center': 'h'}>],
[<Axes: title={'center': 'day_advanced'}>,
<Axes: title={'center': 'arrival_weekday'}>,
<Axes: title={'center': 'is_arrival_weekend'}>, <Axes: >,
<Axes: >, <Axes: >, <Axes: >]], dtype=object)
去除重复行(drop_duplicates 删除的是重复的行,而非字段)¶
In [ ]:
# Drop fully duplicated ROWS in place (drop_duplicates works on rows, not
# columns/fields). `.shape` lines just echo the resulting dimensions.
train_data_rawdf.drop_duplicates(inplace=True)
train_data_rawdf.shape
test_data_rawdf.drop_duplicates(inplace=True)
# Only this last expression is displayed by the notebook cell.
test_data_rawdf.shape
Out[ ]:
(435075, 51)
In [ ]:
# Schema sanity check: train/test should share all columns except `label`.
# NOTE(review): `isSame` is a helper defined in a cell outside this view.
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。 在train_data中独有的字段: Index(['label'], dtype='object') 在test_data中独有的字段: Index([], dtype='object')
缺失值删除(缺失值比例大于40%)¶
In [ ]:
# def delete_data(rawdf):
# print('原来数据维度是:{}'.format(rawdf.shape))
# def nan_drop(df, axi, rate=0.5):
# thresh = df.shape[1-axi] * rate
# df.dropna(axis=axi, thresh=thresh, inplace=True)
# # 删除缺失值比例大于40%的列
# nan_drop(rawdf, axi=1, rate=0.6)
# print('删除缺失率较多的字段后的维度是:{}'.format(rawdf.shape))
# delete_data(train_data_rawdf)
# delete_data(test_data_rawdf)
def delete_data(rawdf):
print('原来数据维度是:{}'.format(rawdf.shape))
def nan_drop(df, axi, rate=0.5):
original_columns = df.columns.tolist() # 保存原始列名
thresh = df.shape[1-axi] * rate
df.dropna(axis=axi, thresh=thresh, inplace=True)
# 找出被删除的列
deleted_columns = [col for col in original_columns if col not in df.columns]
return deleted_columns
# 删除缺失值比例大于60%的列
deleted_columns = nan_drop(rawdf, axi=1, rate=0.6)
print('删除缺失率较多的字段后的维度是:{}'.format(rawdf.shape))
print('被删除的字段有:{}'.format(deleted_columns))
# Drop high-missing-ratio columns from the raw training DataFrame in place.
delete_data(train_data_rawdf)
原来数据维度是:(689945, 46) 删除缺失率较多的字段后的维度是:(689945, 46) 被删除的字段有:[]
In [ ]:
# Columns to remove from the test set so its schema matches the training set
# (these high-missing columns were removed from train by delete_data).
columns_to_drop = [
'decisionhabit_user', 'historyvisit_7ordernum',
'historyvisit_totalordernum', 'historyvisit_avghotelnum',
'historyvisit_visit_detailpagenum', 'firstorder_bu'
]
# Drop the listed columns in place.
test_data_rawdf.drop(columns=columns_to_drop, axis=1, inplace=True)
# Print the resulting shape to confirm the columns are gone.
print('删除指定字段后的维度是:{}'.format(test_data_rawdf.shape))
删除指定字段后的维度是:(435075, 45)
In [ ]:
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。 在train_data中独有的字段: Index(['label'], dtype='object') 在test_data中独有的字段: Index([], dtype='object')
缺失值填充(对于缺失值小于80%)¶
查看含有缺数的数据的偏态¶
In [ ]:
train_data_rawdf.skew()[train_data_rawdf.isnull().mean(0)>0].sort_values()
# For columns that still contain NaNs, choose the fill strategy from the
# distribution shape: near-normal columns -> mean fill; skewed -> median fill.
# From the skew values: starprefer, businessrate_pre2, businessrate_pre,
# customereval_pre2, ordercanceledprecent, consuming_capacity and
# cancelrate_pre get mean fill; every other column with NaNs gets the median.
# NOTE(review): the commented line below indexes TEST skew with the TRAIN
# missing mask — looks like a typo; verify before reusing.
# test_data_rawdf.skew()[train_data_rawdf.isnull().mean(0)>0].sort_values()
Out[ ]:
delta_price1 -48.892476 delta_price2 -16.301581 starprefer -0.361712 businessrate_pre2 -0.109048 businessrate_pre -0.042611 customereval_pre2 -0.033801 ordercanceledprecent 0.655980 consuming_capacity 1.029861 cancelrate_pre 1.262474 deltaprice_pre2_t1 1.457223 price_sensitive 1.504168 lasthtlordergap 1.536367 cr_pre 1.776279 cityuvs 2.034204 cityorders 2.117058 landhalfhours 2.178247 avgprice 2.700013 hotelcr 2.830497 cancelrate 3.707977 lastpvgap 3.862664 uv_pre2 3.947333 uv_pre 4.196402 cr 4.483618 hoteluv 4.504515 ordernum_oneyear 4.641911 commentnums_pre2 4.768733 novoters_pre2 4.777097 novoters_pre 5.220587 commentnums_pre 5.302130 novoters 5.388156 commentnums 5.516973 ordercanncelednum 7.527943 ctrip_profits 9.856848 customer_value_profit 12.304766 lowestprice_pre2 21.554698 visitnum_oneyear 23.299890 lowestprice_pre 50.064034 lowestprice 78.040419 dtype: float64
In [ ]:
# 服从正态分布的使用均值填充,
# 呈偏态分布的,使用中位数填充。
def nan_fill(df):
    """Impute missing values in place: mean for near-symmetric columns, median otherwise.

    Columns listed in ``mean_cols`` showed low skew in the earlier analysis,
    so the mean is a fair central value; all other columns are skewed, where
    the median is the more robust choice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, for convenient chaining/rebinding.
    """
    mean_cols = {"businessrate_pre2", "cancelrate_pre", "businessrate_pre", 'starprefer',
                 'customereval_pre2', 'ordercanceledprecent', 'consuming_capacity'}
    # Compute each column's fill value once, then impute in a single pass
    # instead of a per-column fillna loop.
    fill_values = {col: (df[col].mean() if col in mean_cols else df[col].median())
                   for col in df.columns}
    df.fillna(fill_values, inplace=True)
    return df
# Impute missing values in both frames (mutates in place and rebinds).
train_data_rawdf = nan_fill(train_data_rawdf)
test_data_rawdf = nan_fill(test_data_rawdf)
查看填充结果¶
In [ ]:
# Verify the imputation: selecting skew only for columns that still contain
# NaNs should now yield an empty Series for both frames.
train_data_rawdf.skew()[train_data_rawdf.isnull().mean(0)>0].sort_values() #0
test_data_rawdf.skew()[test_data_rawdf.isnull().mean(0)>0].sort_values() #0
Out[ ]:
Series([], dtype: float64)
异常值-负数的处理¶
In [ ]:
# delta_price1(用户偏好价格-24h浏览最多酒店价格)
# delta_price2(用户偏好价格-24h浏览酒店平均价格)
# lowestprice(当前酒店可定最低价格)三者理论上酒店价格不可能为负,
# 填充:
# 并且由数据分布比较集中,因此负值采取中位数处理。
# customer_value_profit(客户价值_近1年)、ctrip_profits(客户价值)也不应该为负值,
# 分布较为分散,因此将其填充为0
def filter_minus_data(rawdf):
    """Clean impossible negative values in place.

    Customer-value columns (`customer_value_profit`, `ctrip_profits`) get
    negative entries replaced with 0; price columns get them replaced with
    the column median (computed before replacement). Columns absent from
    *rawdf* are skipped.
    """
    # (column group, fill-value factory) pairs, applied in this order
    replacements = [
        (['customer_value_profit', 'ctrip_profits'], lambda s: 0),
        (['delta_price1', 'delta_price2', 'lowestprice'], lambda s: s.median()),
    ]
    for cols, fill in replacements:
        for col in cols:
            if col in rawdf:
                rawdf.loc[rawdf[col] < 0, col] = fill(rawdf[col])
# Replace impossible negative values in both frames (in place).
filter_minus_data(train_data_rawdf)
filter_minus_data(test_data_rawdf)
检验负数处理情况(查看最小值是否有负数)¶
In [ ]:
# Spot-check: lowestprice should no longer contain negatives (min >= 0).
new_1= train_data_rawdf[['lowestprice']].describe()
new_1
# new_1.to_excel('new_1.xlsx')
# df_importance.to_excel('feature_importance.xlsx', index=False)
# train_data_rawdf.describe().T
Out[ ]:
| lowestprice | |
|---|---|
| count | 689945.000000 |
| mean | 318.459732 |
| std | 574.977038 |
| min | 1.000000 |
| 25% | 117.000000 |
| 50% | 200.000000 |
| 75% | 379.000000 |
| max | 100000.000000 |
In [ ]:
# Same negative-value check on every cleaned column of the test set.
test_data_rawdf[['customer_value_profit','ctrip_profits','delta_price1','delta_price2','lowestprice']].describe()
Out[ ]:
| customer_value_profit | ctrip_profits | delta_price1 | delta_price2 | lowestprice | |
|---|---|---|---|---|---|
| count | 435075.000000 | 435075.000000 | 435075.00000 | 435075.000000 | 435075.000000 |
| mean | 2.017994 | 2.702271 | 109.60285 | 92.344403 | 331.389202 |
| std | 4.031082 | 5.626372 | 136.10511 | 128.327708 | 602.492074 |
| min | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 1.000000 |
| 25% | 1.292000 | 1.693000 | 77.00000 | 62.000000 | 120.000000 |
| 50% | 1.292000 | 1.693000 | 77.00000 | 62.000000 | 214.000000 |
| 75% | 1.292000 | 1.693000 | 77.00000 | 62.000000 | 397.000000 |
| max | 167.220000 | 309.153000 | 6074.00000 | 6071.000000 | 100000.000000 |
极值处理-盖帽法¶
处理数据¶
In [ ]:
#盖帽法:某连续变量6西格玛之外的记录用正负3西格玛值替代,
# 一般正负3西格玛包含99%的数据,所以默认凡小于百分之一分位数和大于百分之九十九分位数的值用百分之一分位数和百分之九十九分位数代替,俗称盖帽法
# 盖帽法函数
def cap_values(series, lower_quantile=0.01, upper_quantile=0.99):
    """Winsorize ("cap") *series*: clamp values outside the given quantiles.

    By default, values below the 1st percentile are raised to it and values
    above the 99th percentile are lowered to it, taming extreme outliers
    without dropping rows.

    Parameters
    ----------
    series : pandas.Series
        Numeric series to cap.
    lower_quantile, upper_quantile : float, optional
        Quantile bounds in [0, 1].

    Returns
    -------
    pandas.Series
        A new capped series (the input is not modified).
    """
    q_low = series.quantile(lower_quantile)
    q_high = series.quantile(upper_quantile)
    # Series.clip does both bounds in one vectorized call
    # (replaces the original np.minimum(np.maximum(...)) pair).
    return series.clip(lower=q_low, upper=q_high)
# 应用盖帽法到DataFrame的每一列
def apply_cap_to_df(df, columns):
    """Apply the default 1%/99% cap to each named column of *df* in place.

    Returns the (mutated) DataFrame so the call can be used in assignments.
    """
    for name in columns:
        capped = cap_values(df[name])
        df[name] = capped
    return df
# Apply the 1%/99% capping to every column of both frames (mutates in place
# and rebinds the names).
train_data_rawdf = apply_cap_to_df(train_data_rawdf, train_data_rawdf.columns)
test_data_rawdf = apply_cap_to_df(test_data_rawdf, test_data_rawdf.columns)
# test_data_rawdf.head(10)
# def get_percentile_data(data1):
# # 初始化一个空的DataFrame来存储结果
# result_df = pd.DataFrame()
# # 遍历data1中的每一列
# for column in data1.columns:
# # 计算第1百分位数和第99百分位数
# p1 = np.percentile(data1[column], 1)
# p99 = np.percentile(data1[column], 99)
# # 找出小于第1百分位数的值
# values_below_p1 = data1[data1[column] < p1]
# # 找出大于第99百分位数的值
# values_above_p99 = data1[data1[column] > p99]
# # 将结果合并到result_df中
# result_df = pd.concat([result_df, values_below_p1, values_above_p99])
# # 删除重复的行并重置索引
# result_df = result_df.drop_duplicates().reset_index(drop=True)
# return result_df
# ret = get_percentile_data(train_data_rawdf)
再次查看数据分布偏态情况(skew)¶
In [ ]:
# Re-check skewness after capping; the extreme skews should now be far smaller.
train_data_rawdf.skew().sort_values()
# test_data_rawdf.skew().sort_values()
Out[ ]:
h -0.665606 arrival_weekday -0.186567 businessrate_pre2 -0.136561 starprefer -0.126118 businessrate_pre -0.072562 customereval_pre2 -0.043090 deltaprice_pre2_t1 -0.032096 ordercanceledprecent 0.814217 hotelcr 0.829181 cr_pre 0.841635 is_arrival_weekend 0.984591 label 1.010888 consuming_capacity 1.279602 cancelrate_pre 1.327993 sampleid 1.339944 iforderpv_24h 1.549817 novoters_pre2 1.982481 commentnums_pre2 1.998712 cityuvs 2.045279 cancelrate 2.066955 avgprice 2.076221 day_advanced 2.081788 price_sensitive 2.108583 lasthtlordergap 2.189324 lowestprice_pre2 2.210016 cityorders 2.212808 landhalfhours 2.243412 novoters 2.263438 commentnums 2.354108 novoters_pre 2.444956 commentnums_pre 2.471846 lowestprice_pre 2.479717 lowestprice 2.596511 sid 2.686568 cr 2.752425 delta_price1 2.897290 delta_price2 3.022332 uv_pre2 3.116940 ordernum_oneyear 3.132565 uv_pre 3.287560 hoteluv 3.397452 ordercanncelednum 3.649391 customer_value_profit 3.653897 ctrip_profits 3.680904 lastpvgap 3.773843 visitnum_oneyear 8.120348 dtype: float64
In [ ]:
# Histograms of every column after cleaning and capping.
train_data_rawdf.hist(figsize=(20,20))
# plt.savefig('./images/data_distribution_raw.png')
Out[ ]:
array([[<Axes: title={'center': 'label'}>,
<Axes: title={'center': 'sampleid'}>,
<Axes: title={'center': 'iforderpv_24h'}>,
<Axes: title={'center': 'hotelcr'}>,
<Axes: title={'center': 'ordercanceledprecent'}>,
<Axes: title={'center': 'landhalfhours'}>,
<Axes: title={'center': 'ordercanncelednum'}>],
[<Axes: title={'center': 'commentnums'}>,
<Axes: title={'center': 'starprefer'}>,
<Axes: title={'center': 'novoters'}>,
<Axes: title={'center': 'consuming_capacity'}>,
<Axes: title={'center': 'cancelrate'}>,
<Axes: title={'center': 'delta_price1'}>,
<Axes: title={'center': 'price_sensitive'}>],
[<Axes: title={'center': 'hoteluv'}>,
<Axes: title={'center': 'businessrate_pre'}>,
<Axes: title={'center': 'ordernum_oneyear'}>,
<Axes: title={'center': 'cr_pre'}>,
<Axes: title={'center': 'avgprice'}>,
<Axes: title={'center': 'lowestprice'}>,
<Axes: title={'center': 'customereval_pre2'}>],
[<Axes: title={'center': 'delta_price2'}>,
<Axes: title={'center': 'commentnums_pre'}>,
<Axes: title={'center': 'customer_value_profit'}>,
<Axes: title={'center': 'commentnums_pre2'}>,
<Axes: title={'center': 'cancelrate_pre'}>,
<Axes: title={'center': 'novoters_pre2'}>,
<Axes: title={'center': 'novoters_pre'}>],
[<Axes: title={'center': 'ctrip_profits'}>,
<Axes: title={'center': 'deltaprice_pre2_t1'}>,
<Axes: title={'center': 'lowestprice_pre'}>,
<Axes: title={'center': 'uv_pre'}>,
<Axes: title={'center': 'uv_pre2'}>,
<Axes: title={'center': 'lowestprice_pre2'}>,
<Axes: title={'center': 'lasthtlordergap'}>],
[<Axes: title={'center': 'businessrate_pre2'}>,
<Axes: title={'center': 'cityuvs'}>,
<Axes: title={'center': 'cityorders'}>,
<Axes: title={'center': 'lastpvgap'}>,
<Axes: title={'center': 'cr'}>, <Axes: title={'center': 'sid'}>,
<Axes: title={'center': 'visitnum_oneyear'}>],
[<Axes: title={'center': 'h'}>,
<Axes: title={'center': 'day_advanced'}>,
<Axes: title={'center': 'arrival_weekday'}>,
<Axes: title={'center': 'is_arrival_weekend'}>, <Axes: >,
<Axes: >, <Axes: >]], dtype=object)
再次检查数据-箱型图(极值和负值)¶
In [ ]:
# def show_box(rawdf):
# plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
# plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
# # 遍历所有列
# for column in rawdf.columns:
# plt.figure(figsize=(4, 8), dpi=100)
# plt.boxplot(rawdf[column].dropna().values) # 使用列名来访问数据
# plt.xlabel(column)
# plt.show()
# # 假设train_data_rawdf是你的DataFrame
# show_box(train_data_rawdf)
# show_box(test_data_rawdf)
def show_box(rawdf, ncols=4):
    """Draw one box plot per column of *rawdf*, arranged on a single figure.

    Parameters
    ----------
    rawdf : pandas.DataFrame
        Frame whose columns are plotted; NaNs are dropped per column.
    ncols : int, optional
        Box plots per row. The row count is derived from it, keeping the
        subplot grid consistent (the original hard-coded 13 grid columns
        while sizing rows for 2 per row, leaving most of the figure blank).

    Side effect: draws and shows a matplotlib figure; returns nothing.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels
    plt.rcParams['axes.unicode_minus'] = False    # display minus signs properly
    n = len(rawdf.columns)
    num_rows = (n + ncols - 1) // ncols  # ceiling division: enough rows for all columns
    plt.figure(figsize=(10, num_rows * 4), dpi=100)
    for i, column in enumerate(rawdf.columns):
        ax = plt.subplot(num_rows, ncols, i + 1)
        ax.boxplot(rawdf[column].dropna().values)
        ax.set_xlabel(column, rotation=90)  # vertical label to avoid overlap
    plt.tight_layout()  # avoid overlapping subplots
    plt.show()
# Box-plot every column of the cleaned training frame.
show_box(train_data_rawdf)
# show_box(test_data_rawdf)
梳理列与列之间的关系¶
相关性¶
In [ ]:
# Histograms of every column (repeat of the earlier distribution check).
train_data_rawdf.hist(figsize=(20,20))
# plt.savefig('./images/data_distribution_raw.png')
Out[ ]:
array([[<Axes: title={'center': 'label'}>,
<Axes: title={'center': 'sampleid'}>,
<Axes: title={'center': 'iforderpv_24h'}>,
<Axes: title={'center': 'hotelcr'}>,
<Axes: title={'center': 'ordercanceledprecent'}>,
<Axes: title={'center': 'landhalfhours'}>,
<Axes: title={'center': 'ordercanncelednum'}>],
[<Axes: title={'center': 'commentnums'}>,
<Axes: title={'center': 'starprefer'}>,
<Axes: title={'center': 'novoters'}>,
<Axes: title={'center': 'consuming_capacity'}>,
<Axes: title={'center': 'cancelrate'}>,
<Axes: title={'center': 'delta_price1'}>,
<Axes: title={'center': 'price_sensitive'}>],
[<Axes: title={'center': 'hoteluv'}>,
<Axes: title={'center': 'businessrate_pre'}>,
<Axes: title={'center': 'ordernum_oneyear'}>,
<Axes: title={'center': 'cr_pre'}>,
<Axes: title={'center': 'avgprice'}>,
<Axes: title={'center': 'lowestprice'}>,
<Axes: title={'center': 'customereval_pre2'}>],
[<Axes: title={'center': 'delta_price2'}>,
<Axes: title={'center': 'commentnums_pre'}>,
<Axes: title={'center': 'customer_value_profit'}>,
<Axes: title={'center': 'commentnums_pre2'}>,
<Axes: title={'center': 'cancelrate_pre'}>,
<Axes: title={'center': 'novoters_pre2'}>,
<Axes: title={'center': 'novoters_pre'}>],
[<Axes: title={'center': 'ctrip_profits'}>,
<Axes: title={'center': 'deltaprice_pre2_t1'}>,
<Axes: title={'center': 'lowestprice_pre'}>,
<Axes: title={'center': 'uv_pre'}>,
<Axes: title={'center': 'uv_pre2'}>,
<Axes: title={'center': 'lowestprice_pre2'}>,
<Axes: title={'center': 'lasthtlordergap'}>],
[<Axes: title={'center': 'businessrate_pre2'}>,
<Axes: title={'center': 'cityuvs'}>,
<Axes: title={'center': 'cityorders'}>,
<Axes: title={'center': 'lastpvgap'}>,
<Axes: title={'center': 'cr'}>, <Axes: title={'center': 'sid'}>,
<Axes: title={'center': 'visitnum_oneyear'}>],
[<Axes: title={'center': 'h'}>,
<Axes: title={'center': 'day_advanced'}>,
<Axes: title={'center': 'arrival_weekday'}>,
<Axes: title={'center': 'is_arrival_weekend'}>, <Axes: >,
<Axes: >, <Axes: >]], dtype=object)
In [ ]:
def corr_user1(rawdf):
    """Compute and plot the full pairwise correlation matrix of a DataFrame.

    Parameters
    ----------
    rawdf : pandas.DataFrame
        Frame whose columns are correlated against each other (non-numeric
        columns are handled by ``DataFrame.corr`` itself).

    Returns
    -------
    pandas.DataFrame
        The Pearson correlation matrix from ``rawdf.corr()``.
    """
    # ``rawdf[rawdf.columns]`` is just ``rawdf`` itself, so correlate directly.
    mat = rawdf.corr()
    # Draw the correlation matrix as an annotated heatmap; the figure is
    # doubled in size because every column of the dataset is included.
    fig, ax = plt.subplots(figsize=(18 * 2, 12 * 2))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False, linewidths=.5, annot=True, cmap="YlGnBu")
    plt.show()
    return mat
# Build and display the correlation matrix over every column of the training set.
train_data_rawdf_mat = corr_user1(train_data_rawdf)
train_data_rawdf_mat
Out[ ]:
| label | sampleid | iforderpv_24h | hotelcr | ordercanceledprecent | landhalfhours | ordercanncelednum | commentnums | starprefer | novoters | consuming_capacity | cancelrate | delta_price1 | price_sensitive | hoteluv | businessrate_pre | ordernum_oneyear | cr_pre | avgprice | lowestprice | customereval_pre2 | delta_price2 | commentnums_pre | customer_value_profit | commentnums_pre2 | cancelrate_pre | novoters_pre2 | novoters_pre | ctrip_profits | deltaprice_pre2_t1 | lowestprice_pre | uv_pre | uv_pre2 | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | day_advanced | arrival_weekday | is_arrival_weekend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| label | 1.000000 | -0.000492 | 0.110308 | 0.121993 | -0.005425 | 0.030844 | 0.110617 | 0.002726 | -0.006543 | 0.006997 | -0.024266 | 0.013711 | 0.026361 | 0.018370 | -0.052476 | 0.114840 | 0.150235 | 0.119472 | -0.016352 | -0.065190 | -0.027011 | 0.028846 | 0.000620 | 0.091545 | -0.004073 | 0.018038 | 0.001309 | 0.005137 | 0.088149 | 0.007178 | -0.055868 | -0.055836 | -0.063700 | -0.067297 | -0.058852 | 0.131459 | 0.101187 | 0.102336 | 0.010801 | 0.184888 | 0.016431 | -0.049722 | -0.077728 | -0.153983 | 0.002958 | -0.009220 |
| sampleid | -0.000492 | 1.000000 | -0.000423 | 0.000542 | -0.003193 | 0.001002 | -0.000505 | 0.001947 | -0.001072 | 0.002116 | -0.000317 | 0.001320 | -0.000902 | 0.001268 | 0.000178 | 0.003181 | 0.001463 | 0.000043 | -0.000315 | 0.001775 | 0.001034 | -0.000673 | 0.000614 | 0.002863 | 0.001695 | -0.000915 | 0.002169 | 0.000832 | 0.001199 | 0.000446 | 0.001802 | -0.000657 | -0.000835 | 0.001387 | -0.001727 | 0.001687 | 0.000799 | 0.000759 | 0.001157 | 0.000509 | 0.002542 | 0.000750 | 0.001812 | 0.000224 | -0.001522 | 0.001464 |
| iforderpv_24h | 0.110308 | -0.000423 | 1.000000 | 0.024071 | 0.012170 | 0.233447 | -0.015206 | 0.013740 | -0.025380 | 0.015740 | -0.029280 | 0.012095 | -0.022466 | -0.022761 | 0.002599 | 0.009458 | -0.014624 | 0.061656 | -0.040083 | -0.041892 | -0.031826 | -0.026489 | 0.044817 | -0.029591 | 0.032517 | -0.028610 | 0.036947 | 0.049872 | -0.025420 | 0.006945 | -0.037532 | 0.019053 | 0.009943 | -0.044412 | -0.017532 | 0.012048 | 0.039251 | 0.037020 | -0.005508 | -0.000848 | -0.036166 | 0.073917 | -0.012849 | -0.011125 | 0.008014 | 0.007079 |
| hotelcr | 0.121993 | 0.000542 | 0.024071 | 1.000000 | 0.011143 | -0.009319 | 0.070682 | 0.077973 | -0.032843 | 0.111510 | -0.096958 | 0.145374 | -0.007506 | -0.027991 | -0.195894 | 0.393174 | 0.092868 | 0.455088 | -0.087715 | -0.212134 | -0.018209 | 0.007050 | -0.042322 | 0.008420 | -0.057016 | 0.045860 | -0.037198 | -0.028255 | 0.011452 | 0.025691 | -0.149879 | -0.193813 | -0.221810 | -0.184937 | -0.036398 | 0.462603 | 0.010997 | 0.031117 | 0.008732 | 0.104793 | 0.018775 | 0.012877 | 0.019573 | -0.138218 | -0.012310 | -0.018519 |
| ordercanceledprecent | -0.005425 | -0.003193 | 0.012170 | 0.011143 | 1.000000 | -0.003579 | 0.325672 | -0.016809 | -0.069449 | -0.017515 | -0.064174 | -0.018120 | -0.072026 | -0.033402 | -0.020664 | 0.010344 | 0.082158 | 0.012103 | -0.069562 | -0.024391 | -0.031568 | -0.073073 | -0.017312 | -0.019029 | -0.021717 | 0.027567 | -0.022887 | -0.017762 | -0.016109 | -0.010135 | -0.025645 | -0.020522 | -0.022974 | -0.028283 | 0.015938 | 0.013720 | -0.001178 | -0.000362 | -0.000853 | -0.134614 | 0.018306 | 0.006269 | -0.000564 | 0.002098 | 0.005056 | -0.001970 |
| landhalfhours | 0.030844 | 0.001002 | 0.233447 | -0.009319 | -0.003579 | 1.000000 | 0.011840 | 0.036527 | 0.007570 | 0.037715 | 0.002310 | 0.039000 | 0.010923 | -0.000761 | 0.041235 | -0.026266 | 0.024762 | 0.011789 | 0.003704 | 0.012501 | 0.036590 | 0.001920 | 0.059744 | 0.007725 | 0.056329 | -0.047855 | 0.057026 | 0.061994 | 0.006446 | 0.008655 | 0.024001 | 0.057284 | 0.051673 | 0.030444 | -0.015678 | -0.028497 | 0.045452 | 0.043500 | -0.054367 | -0.011540 | 0.039231 | 0.108321 | 0.001964 | -0.034362 | 0.008332 | 0.030208 |
| ordercanncelednum | 0.110617 | -0.000505 | -0.015206 | 0.070682 | 0.325672 | 0.011840 | 1.000000 | 0.021125 | 0.072213 | 0.022873 | 0.014271 | 0.024535 | 0.043845 | 0.007129 | -0.008243 | 0.092955 | 0.697471 | 0.071068 | 0.033386 | 0.029601 | 0.075713 | 0.039229 | 0.016432 | 0.352043 | 0.020932 | -0.015835 | 0.023322 | 0.017448 | 0.323278 | 0.007086 | 0.034699 | -0.011653 | -0.013259 | 0.038869 | -0.077737 | 0.103838 | 0.023551 | 0.027659 | -0.069852 | 0.181734 | 0.241964 | 0.043761 | -0.003672 | -0.067534 | -0.021776 | -0.018639 |
| commentnums | 0.002726 | 0.001947 | 0.013740 | 0.077973 | -0.016809 | 0.036527 | 0.021125 | 1.000000 | 0.170337 | 0.986627 | 0.172716 | 0.837859 | 0.068394 | 0.028776 | 0.682082 | -0.017608 | 0.031688 | -0.048611 | 0.169607 | 0.224027 | 0.366761 | 0.054512 | 0.415256 | 0.111180 | 0.509763 | -0.214569 | 0.511192 | 0.416398 | 0.100786 | 0.028964 | 0.220525 | 0.405871 | 0.454397 | 0.254302 | 0.014486 | -0.005398 | 0.090860 | 0.070892 | -0.017876 | -0.008408 | 0.063053 | 0.005725 | -0.002646 | 0.074934 | -0.013678 | -0.007720 |
| starprefer | -0.006543 | -0.001072 | -0.025380 | -0.032843 | -0.069449 | 0.007570 | 0.072213 | 0.170337 | 1.000000 | 0.171033 | 0.728197 | 0.182129 | 0.400740 | 0.038752 | 0.166661 | 0.017359 | 0.120942 | -0.039899 | 0.666243 | 0.259878 | 0.356788 | 0.401197 | 0.160576 | 0.385153 | 0.203028 | -0.156913 | 0.204954 | 0.160275 | 0.351759 | 0.041728 | 0.267120 | 0.160029 | 0.180710 | 0.308946 | -0.052847 | 0.012737 | 0.018577 | 0.013694 | -0.058683 | -0.007503 | 0.136853 | 0.020643 | -0.002809 | 0.031312 | -0.014619 | -0.010775 |
| novoters | 0.006997 | 0.002116 | 0.015740 | 0.111510 | -0.017515 | 0.037715 | 0.022873 | 0.986627 | 0.171033 | 1.000000 | 0.171493 | 0.853580 | 0.068881 | 0.028533 | 0.678037 | -0.004879 | 0.034418 | -0.033836 | 0.168560 | 0.219171 | 0.369402 | 0.055179 | 0.417601 | 0.112363 | 0.512748 | -0.217408 | 0.518597 | 0.422311 | 0.101811 | 0.033301 | 0.218488 | 0.403685 | 0.451455 | 0.251498 | 0.013645 | 0.009604 | 0.095031 | 0.075434 | -0.017574 | -0.005125 | 0.063840 | 0.007025 | -0.002321 | 0.071222 | -0.013623 | -0.007789 |
| consuming_capacity | -0.024266 | -0.000317 | -0.029280 | -0.096958 | -0.064174 | 0.002310 | 0.014271 | 0.172716 | 0.728197 | 0.171493 | 1.000000 | 0.186699 | 0.589737 | 0.241250 | 0.195983 | -0.035674 | 0.059060 | -0.104799 | 0.899684 | 0.305614 | 0.333177 | 0.599844 | 0.163096 | 0.459208 | 0.207075 | -0.152446 | 0.206418 | 0.161232 | 0.420644 | 0.062676 | 0.312567 | 0.189766 | 0.213273 | 0.362639 | -0.000045 | -0.046877 | 0.041321 | 0.031740 | -0.043740 | -0.060208 | 0.130227 | 0.017015 | 0.001819 | 0.083663 | -0.003290 | -0.005326 |
| cancelrate | 0.013711 | 0.001320 | 0.012095 | 0.145374 | -0.018120 | 0.039000 | 0.024535 | 0.837859 | 0.182129 | 0.853580 | 0.186699 | 1.000000 | 0.075817 | 0.030782 | 0.756565 | 0.030727 | 0.037084 | -0.022484 | 0.184900 | 0.246753 | 0.388280 | 0.060310 | 0.408483 | 0.124846 | 0.511110 | -0.202260 | 0.515620 | 0.413909 | 0.115460 | 0.042032 | 0.239832 | 0.428912 | 0.480678 | 0.276399 | 0.013571 | 0.048090 | 0.151089 | 0.127710 | -0.018667 | -0.002253 | 0.069283 | 0.004450 | -0.004663 | 0.067196 | -0.014954 | -0.008705 |
| delta_price1 | 0.026361 | -0.000902 | -0.022466 | -0.007506 | -0.072026 | 0.010923 | 0.043845 | 0.068394 | 0.400740 | 0.068881 | 0.589737 | 0.075817 | 1.000000 | 0.217116 | 0.050365 | 0.019380 | 0.060427 | -0.002964 | 0.707538 | 0.074178 | 0.108193 | 0.909763 | 0.071632 | 0.334857 | 0.089728 | -0.057887 | 0.091642 | 0.072447 | 0.320884 | 0.044572 | -0.095753 | 0.048194 | 0.058305 | 0.017120 | 0.077568 | 0.023504 | 0.041702 | 0.039969 | -0.022239 | 0.040365 | 0.137399 | 0.019858 | 0.006350 | 0.009858 | -0.011647 | -0.012393 |
| price_sensitive | 0.018370 | 0.001268 | -0.022761 | -0.027991 | -0.033402 | -0.000761 | 0.007129 | 0.028776 | 0.038752 | 0.028533 | 0.241250 | 0.030782 | 0.217116 | 1.000000 | 0.032767 | -0.015988 | 0.019737 | -0.025215 | 0.245152 | 0.060616 | 0.058495 | 0.224172 | 0.029470 | 0.154500 | 0.034977 | -0.027585 | 0.034775 | 0.029307 | 0.120446 | 0.016819 | 0.061849 | 0.030434 | 0.034278 | 0.073014 | 0.017887 | -0.021591 | 0.020399 | 0.016766 | -0.008366 | 0.056799 | 0.053669 | 0.020829 | -0.000671 | 0.037552 | 0.005733 | -0.000249 |
| hoteluv | -0.052476 | 0.000178 | 0.002599 | -0.195894 | -0.020664 | 0.041235 | -0.008243 | 0.682082 | 0.166661 | 0.678037 | 0.195983 | 0.756565 | 0.050365 | 0.032767 | 1.000000 | -0.206681 | -0.006906 | -0.201410 | 0.192592 | 0.372250 | 0.401654 | 0.032113 | 0.400708 | 0.094996 | 0.508152 | -0.255675 | 0.497864 | 0.397479 | 0.084697 | -0.011403 | 0.310592 | 0.611926 | 0.693862 | 0.364769 | 0.030016 | -0.208543 | 0.059585 | 0.029337 | -0.020363 | -0.045512 | 0.051564 | 0.000132 | -0.011013 | 0.136684 | -0.013809 | -0.002814 |
| businessrate_pre | 0.114840 | 0.003181 | 0.009458 | 0.393174 | 0.010344 | -0.026266 | 0.092955 | -0.017608 | 0.017359 | -0.004879 | -0.035674 | 0.030727 | 0.019380 | -0.015988 | -0.206681 | 1.000000 | 0.118832 | 0.521287 | -0.023756 | -0.081715 | -0.018557 | 0.028578 | -0.033132 | 0.063066 | -0.043829 | 0.113483 | -0.022617 | -0.017423 | 0.066384 | 0.083459 | -0.093702 | -0.287193 | -0.288367 | -0.116378 | -0.041595 | 0.839675 | 0.170056 | 0.183577 | 0.008094 | 0.118519 | 0.042884 | 0.010181 | 0.015272 | -0.164622 | -0.022992 | -0.031647 |
| ordernum_oneyear | 0.150235 | 0.001463 | -0.014624 | 0.092868 | 0.082158 | 0.024762 | 0.697471 | 0.031688 | 0.120942 | 0.034418 | 0.059060 | 0.037084 | 0.060427 | 0.019737 | -0.006906 | 0.118832 | 1.000000 | 0.092990 | 0.074879 | 0.039091 | 0.099951 | 0.055956 | 0.024877 | 0.589498 | 0.032441 | -0.028284 | 0.036702 | 0.026763 | 0.513736 | 0.017308 | 0.044471 | -0.011691 | -0.012170 | 0.049793 | -0.257930 | 0.133643 | 0.030157 | 0.034742 | -0.119503 | 0.282769 | 0.316929 | 0.070454 | -0.007748 | -0.081668 | -0.039014 | -0.029886 |
| cr_pre | 0.119472 | 0.000043 | 0.061656 | 0.455088 | 0.012103 | 0.011789 | 0.071068 | -0.048611 | -0.039899 | -0.033836 | -0.104799 | -0.022484 | -0.002964 | -0.025215 | -0.201410 | 0.521287 | 0.092990 | 1.000000 | -0.094573 | -0.157166 | -0.005069 | 0.008205 | 0.051888 | 0.004601 | -0.017631 | -0.067697 | 0.013853 | 0.084736 | 0.008191 | 0.030638 | -0.215729 | -0.224403 | -0.249726 | -0.230065 | -0.036883 | 0.531262 | 0.012648 | 0.033255 | 0.006104 | 0.104715 | 0.017455 | 0.020455 | 0.014137 | -0.142703 | -0.012612 | -0.020682 |
| avgprice | -0.016352 | -0.000315 | -0.040083 | -0.087715 | -0.069562 | 0.003704 | 0.033386 | 0.169607 | 0.666243 | 0.168560 | 0.899684 | 0.184900 | 0.707538 | 0.245152 | 0.192592 | -0.023756 | 0.074879 | -0.094573 | 1.000000 | 0.314600 | 0.337313 | 0.724924 | 0.157997 | 0.490475 | 0.200693 | -0.151754 | 0.200586 | 0.156171 | 0.456162 | 0.066985 | 0.321117 | 0.184727 | 0.207600 | 0.372497 | 0.030867 | -0.032136 | 0.041208 | 0.031936 | -0.045533 | -0.023160 | 0.165819 | 0.026597 | 0.003963 | 0.083902 | -0.006420 | -0.010198 |
| lowestprice | -0.065190 | 0.001775 | -0.041892 | -0.212134 | -0.024391 | 0.012501 | 0.029601 | 0.224027 | 0.259878 | 0.219171 | 0.305614 | 0.246753 | 0.074178 | 0.060616 | 0.372250 | -0.081715 | 0.039091 | -0.157166 | 0.314600 | 1.000000 | 0.418775 | 0.045705 | 0.197605 | 0.189463 | 0.255902 | -0.176545 | 0.251468 | 0.194376 | 0.176076 | 0.044972 | 0.482142 | 0.293774 | 0.336927 | 0.565141 | 0.027427 | -0.100605 | 0.042023 | 0.023651 | -0.027053 | -0.021063 | 0.109760 | 0.023946 | -0.006463 | 0.130505 | 0.015271 | 0.011663 |
| customereval_pre2 | -0.027011 | 0.001034 | -0.031826 | -0.018209 | -0.031568 | 0.036590 | 0.075713 | 0.366761 | 0.356788 | 0.369402 | 0.333177 | 0.388280 | 0.108193 | 0.058495 | 0.401654 | -0.018557 | 0.099951 | -0.005069 | 0.337313 | 0.418775 | 1.000000 | 0.081975 | 0.434567 | 0.238933 | 0.542253 | -0.442958 | 0.545988 | 0.436519 | 0.220543 | 0.025923 | 0.516774 | 0.475066 | 0.533003 | 0.592722 | 0.012611 | -0.019580 | -0.055406 | -0.067485 | -0.049631 | 0.003828 | 0.147123 | 0.036845 | -0.014246 | 0.107125 | -0.022543 | -0.011237 |
| delta_price2 | 0.028846 | -0.000673 | -0.026489 | 0.007050 | -0.073073 | 0.001920 | 0.039229 | 0.054512 | 0.401197 | 0.055179 | 0.599844 | 0.060310 | 0.909763 | 0.224172 | 0.032113 | 0.028578 | 0.055956 | 0.008205 | 0.724924 | 0.045705 | 0.081975 | 1.000000 | 0.057185 | 0.337208 | 0.068761 | -0.052914 | 0.071564 | 0.058056 | 0.322430 | 0.034209 | -0.027662 | 0.034070 | 0.035790 | -0.031884 | 0.081158 | 0.032584 | 0.030479 | 0.031175 | -0.017904 | 0.043167 | 0.132286 | 0.017452 | 0.007739 | -0.000622 | -0.012919 | -0.012592 |
| commentnums_pre | 0.000620 | 0.000614 | 0.044817 | -0.042322 | -0.017312 | 0.059744 | 0.016432 | 0.415256 | 0.160576 | 0.417601 | 0.163096 | 0.408483 | 0.071632 | 0.029470 | 0.400708 | -0.033132 | 0.024877 | 0.051888 | 0.157997 | 0.197605 | 0.434567 | 0.057185 | 1.000000 | 0.099698 | 0.821527 | -0.310531 | 0.822742 | 0.986821 | 0.091443 | 0.038209 | 0.231631 | 0.684716 | 0.600708 | 0.261593 | 0.016245 | -0.018595 | 0.081078 | 0.061015 | -0.014918 | -0.011920 | 0.054643 | 0.014873 | 0.001200 | 0.085242 | -0.016114 | -0.007082 |
| customer_value_profit | 0.091545 | 0.002863 | -0.029591 | 0.008420 | -0.019029 | 0.007725 | 0.352043 | 0.111180 | 0.385153 | 0.112363 | 0.459208 | 0.124846 | 0.334857 | 0.154500 | 0.094996 | 0.063066 | 0.589498 | 0.004601 | 0.490475 | 0.189463 | 0.238933 | 0.337208 | 0.099698 | 1.000000 | 0.126725 | -0.093388 | 0.129934 | 0.100088 | 0.846496 | 0.051508 | 0.198176 | 0.089146 | 0.100517 | 0.227226 | -0.140166 | 0.065567 | 0.046860 | 0.044189 | -0.095766 | 0.183976 | 0.282234 | 0.060931 | -0.004408 | -0.004478 | -0.026940 | -0.024283 |
| commentnums_pre2 | -0.004073 | 0.001695 | 0.032517 | -0.057016 | -0.021717 | 0.056329 | 0.020932 | 0.509763 | 0.203028 | 0.512748 | 0.207075 | 0.511110 | 0.089728 | 0.034977 | 0.508152 | -0.043829 | 0.032441 | -0.017631 | 0.200693 | 0.255902 | 0.542253 | 0.068761 | 0.821527 | 0.126725 | 1.000000 | -0.329677 | 0.981899 | 0.815104 | 0.115448 | 0.045098 | 0.287523 | 0.673500 | 0.739316 | 0.332534 | 0.019501 | -0.025075 | 0.104632 | 0.078952 | -0.021515 | -0.014350 | 0.072572 | 0.012653 | -0.003364 | 0.106090 | -0.019686 | -0.007454 |
| cancelrate_pre | 0.018038 | -0.000915 | -0.028610 | 0.045860 | 0.027567 | -0.047855 | -0.015835 | -0.214569 | -0.156913 | -0.217408 | -0.152446 | -0.202260 | -0.057887 | -0.027585 | -0.255675 | 0.113483 | -0.028284 | -0.067697 | -0.151754 | -0.176545 | -0.442958 | -0.052914 | -0.310531 | -0.093388 | -0.329677 | 1.000000 | -0.340087 | -0.322329 | -0.084365 | -0.009206 | -0.220220 | -0.320180 | -0.330733 | -0.237741 | -0.011567 | 0.144381 | 0.077926 | 0.088606 | 0.028619 | 0.013600 | -0.062432 | -0.024932 | 0.010556 | -0.089451 | 0.013312 | 0.002962 |
| novoters_pre2 | 0.001309 | 0.002169 | 0.036947 | -0.037198 | -0.022887 | 0.057026 | 0.023322 | 0.511192 | 0.204954 | 0.518597 | 0.206418 | 0.515620 | 0.091642 | 0.034775 | 0.497864 | -0.022617 | 0.036702 | 0.013853 | 0.200586 | 0.251468 | 0.545988 | 0.071564 | 0.822742 | 0.129934 | 0.981899 | -0.340087 | 1.000000 | 0.833242 | 0.118689 | 0.055052 | 0.282702 | 0.666136 | 0.727811 | 0.325686 | 0.018249 | -0.000377 | 0.108625 | 0.083775 | -0.020732 | -0.009001 | 0.073381 | 0.015749 | -0.002815 | 0.099393 | -0.020253 | -0.008146 |
| novoters_pre | 0.005137 | 0.000832 | 0.049872 | -0.028255 | -0.017762 | 0.061994 | 0.017448 | 0.416398 | 0.160275 | 0.422311 | 0.161232 | 0.413909 | 0.072447 | 0.029307 | 0.397479 | -0.017423 | 0.026763 | 0.084736 | 0.156171 | 0.194376 | 0.436519 | 0.058056 | 0.986821 | 0.100088 | 0.815104 | -0.322329 | 0.833242 | 1.000000 | 0.091892 | 0.043769 | 0.226831 | 0.681359 | 0.596813 | 0.256747 | 0.015645 | -0.002446 | 0.085365 | 0.065592 | -0.014033 | -0.008970 | 0.054241 | 0.017952 | 0.001227 | 0.081802 | -0.016665 | -0.007869 |
| ctrip_profits | 0.088149 | 0.001199 | -0.025420 | 0.011452 | -0.016109 | 0.006446 | 0.323278 | 0.100786 | 0.351759 | 0.101811 | 0.420644 | 0.115460 | 0.320884 | 0.120446 | 0.084697 | 0.066384 | 0.513736 | 0.008191 | 0.456162 | 0.176076 | 0.220543 | 0.322430 | 0.091443 | 0.846496 | 0.115448 | -0.084365 | 0.118689 | 0.091892 | 1.000000 | 0.052051 | 0.183182 | 0.079768 | 0.089452 | 0.211305 | -0.096864 | 0.070251 | 0.052718 | 0.050835 | -0.094942 | 0.159805 | 0.286329 | 0.065248 | 0.000656 | -0.013352 | -0.029033 | -0.025471 |
| deltaprice_pre2_t1 | 0.007178 | 0.000446 | 0.006945 | 0.025691 | -0.010135 | 0.008655 | 0.007086 | 0.028964 | 0.041728 | 0.033301 | 0.062676 | 0.042032 | 0.044572 | 0.016819 | -0.011403 | 0.083459 | 0.017308 | 0.030638 | 0.066985 | 0.044972 | 0.025923 | 0.034209 | 0.038209 | 0.051508 | 0.045098 | -0.009206 | 0.055052 | 0.043769 | 0.052051 | 1.000000 | 0.071691 | -0.011070 | -0.016392 | 0.080952 | -0.001456 | 0.090499 | 0.086132 | 0.085994 | -0.000505 | 0.018554 | 0.022127 | 0.001160 | 0.001640 | -0.008466 | -0.004573 | -0.004313 |
| lowestprice_pre | -0.055868 | 0.001802 | -0.037532 | -0.149879 | -0.025645 | 0.024001 | 0.034699 | 0.220525 | 0.267120 | 0.218488 | 0.312567 | 0.239832 | -0.095753 | 0.061849 | 0.310592 | -0.093702 | 0.044471 | -0.215729 | 0.321117 | 0.482142 | 0.516774 | -0.027662 | 0.231631 | 0.198176 | 0.287523 | -0.220220 | 0.282702 | 0.226831 | 0.183182 | 0.071691 | 1.000000 | 0.371156 | 0.391096 | 0.847433 | 0.028872 | -0.112667 | 0.043150 | 0.024742 | -0.024253 | -0.016253 | 0.109393 | 0.016503 | -0.005723 | 0.135596 | 0.012064 | 0.008757 |
| uv_pre | -0.055836 | -0.000657 | 0.019053 | -0.193813 | -0.020522 | 0.057284 | -0.011653 | 0.405871 | 0.160029 | 0.403685 | 0.189766 | 0.428912 | 0.048194 | 0.030434 | 0.611926 | -0.287193 | -0.011691 | -0.224403 | 0.184727 | 0.293774 | 0.475066 | 0.034070 | 0.684716 | 0.089146 | 0.673500 | -0.320180 | 0.666136 | 0.681359 | 0.079768 | -0.011070 | 0.371156 | 1.000000 | 0.899233 | 0.403663 | 0.030990 | -0.250915 | 0.047793 | 0.017647 | -0.016644 | -0.049527 | 0.045093 | 0.006259 | -0.006515 | 0.149208 | -0.018078 | -0.004403 |
| uv_pre2 | -0.063700 | -0.000835 | 0.009943 | -0.221810 | -0.022974 | 0.051673 | -0.013259 | 0.454397 | 0.180710 | 0.451455 | 0.213273 | 0.480678 | 0.058305 | 0.034278 | 0.693862 | -0.288367 | -0.012170 | -0.249726 | 0.207600 | 0.336927 | 0.533003 | 0.035790 | 0.600708 | 0.100517 | 0.739316 | -0.330733 | 0.727811 | 0.596813 | 0.089452 | -0.016392 | 0.391096 | 0.899233 | 1.000000 | 0.456633 | 0.034105 | -0.282605 | 0.056401 | 0.021987 | -0.020083 | -0.053981 | 0.051593 | 0.002284 | -0.012470 | 0.166380 | -0.018840 | -0.002906 |
| lowestprice_pre2 | -0.067297 | 0.001387 | -0.044412 | -0.184937 | -0.028283 | 0.030444 | 0.038869 | 0.254302 | 0.308946 | 0.251498 | 0.362639 | 0.276399 | 0.017120 | 0.073014 | 0.364769 | -0.116378 | 0.049793 | -0.230065 | 0.372497 | 0.565141 | 0.592722 | -0.031884 | 0.261593 | 0.227226 | 0.332534 | -0.237741 | 0.325686 | 0.256747 | 0.211305 | 0.080952 | 0.847433 | 0.403663 | 0.456633 | 1.000000 | 0.031635 | -0.142615 | 0.055594 | 0.032771 | -0.030310 | -0.020870 | 0.128235 | 0.021339 | -0.005808 | 0.160922 | 0.014025 | 0.011842 |
| lasthtlordergap | -0.058852 | -0.001727 | -0.017532 | -0.036398 | 0.015938 | -0.015678 | -0.077737 | 0.014486 | -0.052847 | 0.013645 | -0.000045 | 0.013571 | 0.077568 | 0.017887 | 0.030016 | -0.041595 | -0.257930 | -0.036883 | 0.030867 | 0.027427 | 0.012611 | 0.081158 | 0.016245 | -0.140166 | 0.019501 | -0.011567 | 0.018249 | 0.015645 | -0.096864 | -0.001456 | 0.028872 | 0.030990 | 0.034105 | 0.031635 | 1.000000 | -0.049963 | -0.022230 | -0.024010 | 0.155794 | -0.278271 | -0.003972 | -0.036863 | 0.013465 | 0.077942 | 0.010573 | 0.003588 |
| businessrate_pre2 | 0.131459 | 0.001687 | 0.012048 | 0.462603 | 0.013720 | -0.028497 | 0.103838 | -0.005398 | 0.012737 | 0.009604 | -0.046877 | 0.048090 | 0.023504 | -0.021591 | -0.208543 | 0.839675 | 0.133643 | 0.531262 | -0.032136 | -0.100605 | -0.019580 | 0.032584 | -0.018595 | 0.065567 | -0.025075 | 0.144381 | -0.000377 | -0.002446 | 0.070251 | 0.090499 | -0.112667 | -0.250915 | -0.282605 | -0.142615 | -0.049963 | 1.000000 | 0.189061 | 0.205481 | 0.010175 | 0.138909 | 0.045270 | 0.011927 | 0.018820 | -0.193663 | -0.022434 | -0.035654 |
| cityuvs | 0.101187 | 0.000799 | 0.039251 | 0.010997 | -0.001178 | 0.045452 | 0.023551 | 0.090860 | 0.018577 | 0.095031 | 0.041321 | 0.151089 | 0.041702 | 0.020399 | 0.059585 | 0.170056 | 0.030157 | 0.012648 | 0.041208 | 0.042023 | -0.055406 | 0.030479 | 0.081078 | 0.046860 | 0.104632 | 0.077926 | 0.108625 | 0.085365 | 0.052718 | 0.086132 | 0.043150 | 0.047793 | 0.056401 | 0.055594 | -0.022230 | 0.189061 | 1.000000 | 0.987370 | 0.014811 | 0.059975 | 0.010518 | -0.012771 | -0.037693 | -0.298827 | 0.013699 | -0.037376 |
| cityorders | 0.102336 | 0.000759 | 0.037020 | 0.031117 | -0.000362 | 0.043500 | 0.027659 | 0.070892 | 0.013694 | 0.075434 | 0.031740 | 0.127710 | 0.039969 | 0.016766 | 0.029337 | 0.183577 | 0.034742 | 0.033255 | 0.031936 | 0.023651 | -0.067485 | 0.031175 | 0.061015 | 0.044189 | 0.078952 | 0.088606 | 0.083775 | 0.065592 | 0.050835 | 0.085994 | 0.024742 | 0.017647 | 0.021987 | 0.032771 | -0.024010 | 0.205481 | 0.987370 | 1.000000 | 0.014391 | 0.066153 | 0.010385 | -0.012186 | -0.041568 | -0.297744 | -0.006654 | -0.050069 |
| lastpvgap | 0.010801 | 0.001157 | -0.005508 | 0.008732 | -0.000853 | -0.054367 | -0.069852 | -0.017876 | -0.058683 | -0.017574 | -0.043740 | -0.018667 | -0.022239 | -0.008366 | -0.020363 | 0.008094 | -0.119503 | 0.006104 | -0.045533 | -0.027053 | -0.049631 | -0.017904 | -0.014918 | -0.095766 | -0.021515 | 0.028619 | -0.020732 | -0.014033 | -0.094942 | -0.000505 | -0.024253 | -0.016644 | -0.020083 | -0.030310 | 0.155794 | 0.010175 | 0.014811 | 0.014391 | 1.000000 | -0.026634 | -0.116780 | -0.041993 | 0.017968 | -0.013236 | 0.015911 | 0.001467 |
| cr | 0.184888 | 0.000509 | -0.000848 | 0.104793 | -0.134614 | -0.011540 | 0.181734 | -0.008408 | -0.007503 | -0.005125 | -0.060208 | -0.002253 | 0.040365 | 0.056799 | -0.045512 | 0.118519 | 0.282769 | 0.104715 | -0.023160 | -0.021063 | 0.003828 | 0.043167 | -0.011920 | 0.183976 | -0.014350 | 0.013600 | -0.009001 | -0.008970 | 0.159805 | 0.018554 | -0.016253 | -0.049527 | -0.053981 | -0.020870 | -0.278271 | 0.138909 | 0.059975 | 0.066153 | -0.026634 | 1.000000 | 0.057134 | -0.029451 | 0.013188 | -0.128660 | -0.018347 | -0.017570 |
| sid | 0.016431 | 0.002542 | -0.036166 | 0.018775 | 0.018306 | 0.039231 | 0.241964 | 0.063053 | 0.136853 | 0.063840 | 0.130227 | 0.069283 | 0.137399 | 0.053669 | 0.051564 | 0.042884 | 0.316929 | 0.017455 | 0.165819 | 0.109760 | 0.147123 | 0.132286 | 0.054643 | 0.282234 | 0.072572 | -0.062432 | 0.073381 | 0.054241 | 0.286329 | 0.022127 | 0.109393 | 0.045093 | 0.051593 | 0.128235 | -0.003972 | 0.045270 | 0.010518 | 0.010385 | -0.116780 | 0.057134 | 1.000000 | -0.000815 | -0.001223 | 0.021620 | -0.013090 | -0.010283 |
| visitnum_oneyear | -0.049722 | 0.000750 | 0.073917 | 0.012877 | 0.006269 | 0.108321 | 0.043761 | 0.005725 | 0.020643 | 0.007025 | 0.017015 | 0.004450 | 0.019858 | 0.020829 | 0.000132 | 0.010181 | 0.070454 | 0.020455 | 0.026597 | 0.023946 | 0.036845 | 0.017452 | 0.014873 | 0.060931 | 0.012653 | -0.024932 | 0.015749 | 0.017952 | 0.065248 | 0.001160 | 0.016503 | 0.006259 | 0.002284 | 0.021339 | -0.036863 | 0.011927 | -0.012771 | -0.012186 | -0.041993 | -0.029451 | -0.000815 | 1.000000 | -0.013557 | -0.030330 | -0.006335 | 0.000296 |
| h | -0.077728 | 0.001812 | -0.012849 | 0.019573 | -0.000564 | 0.001964 | -0.003672 | -0.002646 | -0.002809 | -0.002321 | 0.001819 | -0.004663 | 0.006350 | -0.000671 | -0.011013 | 0.015272 | -0.007748 | 0.014137 | 0.003963 | -0.006463 | -0.014246 | 0.007739 | 0.001200 | -0.004408 | -0.003364 | 0.010556 | -0.002815 | 0.001227 | 0.000656 | 0.001640 | -0.005723 | -0.006515 | -0.012470 | -0.005808 | 0.013465 | 0.018820 | -0.037693 | -0.041568 | 0.017968 | 0.013188 | -0.001223 | -0.013557 | 1.000000 | 0.032661 | -0.003044 | -0.007706 |
| day_advanced | -0.153983 | 0.000224 | -0.011125 | -0.138218 | 0.002098 | -0.034362 | -0.067534 | 0.074934 | 0.031312 | 0.071222 | 0.083663 | 0.067196 | 0.009858 | 0.037552 | 0.136684 | -0.164622 | -0.081668 | -0.142703 | 0.083902 | 0.130505 | 0.107125 | -0.000622 | 0.085242 | -0.004478 | 0.106090 | -0.089451 | 0.099393 | 0.081802 | -0.013352 | -0.008466 | 0.135596 | 0.149208 | 0.166380 | 0.160922 | 0.077942 | -0.193663 | -0.298827 | -0.297744 | -0.013236 | -0.128660 | 0.021620 | -0.030330 | 0.032661 | 1.000000 | -0.003262 | -0.040864 |
| arrival_weekday | 0.002958 | -0.001522 | 0.008014 | -0.012310 | 0.005056 | 0.008332 | -0.021776 | -0.013678 | -0.014619 | -0.013623 | -0.003290 | -0.014954 | -0.011647 | 0.005733 | -0.013809 | -0.022992 | -0.039014 | -0.012612 | -0.006420 | 0.015271 | -0.022543 | -0.012919 | -0.016114 | -0.026940 | -0.019686 | 0.013312 | -0.020253 | -0.016665 | -0.029033 | -0.004573 | 0.012064 | -0.018078 | -0.018840 | 0.014025 | 0.010573 | -0.022434 | 0.013699 | -0.006654 | 0.015911 | -0.018347 | -0.013090 | -0.006335 | -0.003044 | -0.003262 | 1.000000 | 0.748189 |
| is_arrival_weekend | -0.009220 | 0.001464 | 0.007079 | -0.018519 | -0.001970 | 0.030208 | -0.018639 | -0.007720 | -0.010775 | -0.007789 | -0.005326 | -0.008705 | -0.012393 | -0.000249 | -0.002814 | -0.031647 | -0.029886 | -0.020682 | -0.010198 | 0.011663 | -0.011237 | -0.012592 | -0.007082 | -0.024283 | -0.007454 | 0.002962 | -0.008146 | -0.007869 | -0.025471 | -0.004313 | 0.008757 | -0.004403 | -0.002906 | 0.011842 | 0.003588 | -0.035654 | -0.037376 | -0.050069 | 0.001467 | -0.017570 | -0.010283 | 0.000296 | -0.007706 | -0.040864 | 0.748189 | 1.000000 |
对用户特征相关分析¶
用户特征提取¶
In [ ]:
# User-side feature columns selected for the user-behaviour correlation analysis.
user_features=['visitnum_oneyear','starprefer','sid','price_sensitive','ordernum_oneyear','ordercanncelednum','ordercanceledprecent','lastpvgap',
'lasthtlordergap','landhalfhours','iforderpv_24h','historyvisit_totalordernum','historyvisit_avghotelnum','h',
'delta_price2','delta_price1','decisionhabit_user','customer_value_profit','ctrip_profits','cr','consuming_capacity','avgprice']
生成用户特征的相关性矩阵+热度图¶
In [ ]:
def corr_user(rawdf):
    """Correlate the user-side features present in ``rawdf`` and plot them.

    Columns listed in the module-level ``user_features`` but absent from
    ``rawdf`` are dropped (the kept columns are printed for transparency),
    so the same function works for both the train and the test frames,
    whose schemas differ slightly.

    Parameters
    ----------
    rawdf : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Correlation matrix of the available user features.
    """
    # One pass instead of building a "missing" list and filtering a copy.
    available = [col for col in user_features if col in rawdf.columns]
    print(available)
    mat = rawdf[available].corr()
    # Heatmap of the user-feature correlation matrix.
    fig, ax = plt.subplots(figsize=(18, 12))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False, linewidths=.5, annot=True, cmap="YlGnBu")
    plt.show()
    return mat
# Correlation analysis of user features on the training set; the test-set
# call is kept commented out and can be enabled when needed.
train_data_rawdf_mat = corr_user(train_data_rawdf)
train_data_rawdf_mat
# test_data_rawdf_mat = corr_user(test_data_rawdf)
# test_data_rawdf_mat
# 从热图中看出:
# >0.85
# delta_price1和delta_price2的相关性高达0.92,前者表示用户偏好价格-24小时浏览最多酒店价格,后者表示用户偏好价格-24小时浏览酒店平均价格,说明浏览24小时内浏览最多的酒店价格会影响到浏览酒店的平均价格,这可以理解为众数和平均数的关系。因此可以选择PCA提取一个主成分表示用户价格偏好。
# ordernum_oneyear和historyvisit_totalordernum的相关性高达1.0,两者都是表示用户1年内订单数,特征选取时可以只选择其一,这里选择ordernum_oneyear作为用户年订单数的特征,也可以用PCA降维;
# decisionhabit_user和historyvisit_avghotelnum的相关性达到了0.94,前者表示用户决策习惯,后者表示近三个月用户日均访问酒店数。说明决策时间久的用户近三个月访问酒店数的平均影响也越多,反过来也是,访问的酒店越多,该用户决策时间越久。
# customer_value_profit和ctrip_profits之间的相关性达到了0.85,前者表示用户近一年的价值,后者也表示用户价值,细分区别在于衡量的时间长度不同,这里也选择PCA提取一个主成分表示用户价值。
# consuming_capacity和avgprice之间的相关性达到了0.88,前者表示用户消费能力指数,后者表示酒店平均价格。很明显,消费能力越高,所选择的酒店平均价格大概率也越高。这里选择consuming_capacity来代表用户消费能力特征,也可以考虑用PCA降维综合这两个特征。
['visitnum_oneyear', 'starprefer', 'sid', 'price_sensitive', 'ordernum_oneyear', 'ordercanncelednum', 'ordercanceledprecent', 'lastpvgap', 'lasthtlordergap', 'landhalfhours', 'iforderpv_24h', 'h', 'delta_price2', 'delta_price1', 'customer_value_profit', 'ctrip_profits', 'cr', 'consuming_capacity', 'avgprice']
Out[ ]:
| visitnum_oneyear | starprefer | sid | price_sensitive | ordernum_oneyear | ordercanncelednum | ordercanceledprecent | lastpvgap | lasthtlordergap | landhalfhours | iforderpv_24h | h | delta_price2 | delta_price1 | customer_value_profit | ctrip_profits | cr | consuming_capacity | avgprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| visitnum_oneyear | 1.000000 | 0.020643 | -0.000815 | 0.020829 | 0.070454 | 0.043761 | 0.006269 | -0.041993 | -0.036863 | 0.108321 | 0.073917 | -0.013557 | 0.017452 | 0.019858 | 0.060931 | 0.065248 | -0.029451 | 0.017015 | 0.026597 |
| starprefer | 0.020643 | 1.000000 | 0.136853 | 0.038752 | 0.120942 | 0.072213 | -0.069449 | -0.058683 | -0.052847 | 0.007570 | -0.025380 | -0.002809 | 0.401197 | 0.400740 | 0.385153 | 0.351759 | -0.007503 | 0.728197 | 0.666243 |
| sid | -0.000815 | 0.136853 | 1.000000 | 0.053669 | 0.316929 | 0.241964 | 0.018306 | -0.116780 | -0.003972 | 0.039231 | -0.036166 | -0.001223 | 0.132286 | 0.137399 | 0.282234 | 0.286329 | 0.057134 | 0.130227 | 0.165819 |
| price_sensitive | 0.020829 | 0.038752 | 0.053669 | 1.000000 | 0.019737 | 0.007129 | -0.033402 | -0.008366 | 0.017887 | -0.000761 | -0.022761 | -0.000671 | 0.224172 | 0.217116 | 0.154500 | 0.120446 | 0.056799 | 0.241250 | 0.245152 |
| ordernum_oneyear | 0.070454 | 0.120942 | 0.316929 | 0.019737 | 1.000000 | 0.697471 | 0.082158 | -0.119503 | -0.257930 | 0.024762 | -0.014624 | -0.007748 | 0.055956 | 0.060427 | 0.589498 | 0.513736 | 0.282769 | 0.059060 | 0.074879 |
| ordercanncelednum | 0.043761 | 0.072213 | 0.241964 | 0.007129 | 0.697471 | 1.000000 | 0.325672 | -0.069852 | -0.077737 | 0.011840 | -0.015206 | -0.003672 | 0.039229 | 0.043845 | 0.352043 | 0.323278 | 0.181734 | 0.014271 | 0.033386 |
| ordercanceledprecent | 0.006269 | -0.069449 | 0.018306 | -0.033402 | 0.082158 | 0.325672 | 1.000000 | -0.000853 | 0.015938 | -0.003579 | 0.012170 | -0.000564 | -0.073073 | -0.072026 | -0.019029 | -0.016109 | -0.134614 | -0.064174 | -0.069562 |
| lastpvgap | -0.041993 | -0.058683 | -0.116780 | -0.008366 | -0.119503 | -0.069852 | -0.000853 | 1.000000 | 0.155794 | -0.054367 | -0.005508 | 0.017968 | -0.017904 | -0.022239 | -0.095766 | -0.094942 | -0.026634 | -0.043740 | -0.045533 |
| lasthtlordergap | -0.036863 | -0.052847 | -0.003972 | 0.017887 | -0.257930 | -0.077737 | 0.015938 | 0.155794 | 1.000000 | -0.015678 | -0.017532 | 0.013465 | 0.081158 | 0.077568 | -0.140166 | -0.096864 | -0.278271 | -0.000045 | 0.030867 |
| landhalfhours | 0.108321 | 0.007570 | 0.039231 | -0.000761 | 0.024762 | 0.011840 | -0.003579 | -0.054367 | -0.015678 | 1.000000 | 0.233447 | 0.001964 | 0.001920 | 0.010923 | 0.007725 | 0.006446 | -0.011540 | 0.002310 | 0.003704 |
| iforderpv_24h | 0.073917 | -0.025380 | -0.036166 | -0.022761 | -0.014624 | -0.015206 | 0.012170 | -0.005508 | -0.017532 | 0.233447 | 1.000000 | -0.012849 | -0.026489 | -0.022466 | -0.029591 | -0.025420 | -0.000848 | -0.029280 | -0.040083 |
| h | -0.013557 | -0.002809 | -0.001223 | -0.000671 | -0.007748 | -0.003672 | -0.000564 | 0.017968 | 0.013465 | 0.001964 | -0.012849 | 1.000000 | 0.007739 | 0.006350 | -0.004408 | 0.000656 | 0.013188 | 0.001819 | 0.003963 |
| delta_price2 | 0.017452 | 0.401197 | 0.132286 | 0.224172 | 0.055956 | 0.039229 | -0.073073 | -0.017904 | 0.081158 | 0.001920 | -0.026489 | 0.007739 | 1.000000 | 0.909763 | 0.337208 | 0.322430 | 0.043167 | 0.599844 | 0.724924 |
| delta_price1 | 0.019858 | 0.400740 | 0.137399 | 0.217116 | 0.060427 | 0.043845 | -0.072026 | -0.022239 | 0.077568 | 0.010923 | -0.022466 | 0.006350 | 0.909763 | 1.000000 | 0.334857 | 0.320884 | 0.040365 | 0.589737 | 0.707538 |
| customer_value_profit | 0.060931 | 0.385153 | 0.282234 | 0.154500 | 0.589498 | 0.352043 | -0.019029 | -0.095766 | -0.140166 | 0.007725 | -0.029591 | -0.004408 | 0.337208 | 0.334857 | 1.000000 | 0.846496 | 0.183976 | 0.459208 | 0.490475 |
| ctrip_profits | 0.065248 | 0.351759 | 0.286329 | 0.120446 | 0.513736 | 0.323278 | -0.016109 | -0.094942 | -0.096864 | 0.006446 | -0.025420 | 0.000656 | 0.322430 | 0.320884 | 0.846496 | 1.000000 | 0.159805 | 0.420644 | 0.456162 |
| cr | -0.029451 | -0.007503 | 0.057134 | 0.056799 | 0.282769 | 0.181734 | -0.134614 | -0.026634 | -0.278271 | -0.011540 | -0.000848 | 0.013188 | 0.043167 | 0.040365 | 0.183976 | 0.159805 | 1.000000 | -0.060208 | -0.023160 |
| consuming_capacity | 0.017015 | 0.728197 | 0.130227 | 0.241250 | 0.059060 | 0.014271 | -0.064174 | -0.043740 | -0.000045 | 0.002310 | -0.029280 | 0.001819 | 0.599844 | 0.589737 | 0.459208 | 0.420644 | -0.060208 | 1.000000 | 0.899684 |
| avgprice | 0.026597 | 0.666243 | 0.165819 | 0.245152 | 0.074879 | 0.033386 | -0.069562 | -0.045533 | 0.030867 | 0.003704 | -0.040083 | 0.003963 | 0.724924 | 0.707538 | 0.490475 | 0.456162 | -0.023160 | 0.899684 | 1.000000 |
酒店信息特征相关性分析¶
酒店特征¶
In [ ]:
# Hotel-side feature columns selected for the hotel-information correlation analysis.
hotel_features=['hotelcr','hoteluv','commentnums','novoters','cancelrate','lowestprice','cr_pre','uv_pre','uv_pre2','businessrate_pre',
'businessrate_pre2','customereval_pre2','commentnums_pre','commentnums_pre2','cancelrate_pre','novoters_pre','novoters_pre2',
'deltaprice_pre2_t1','lowestprice_pre','lowestprice_pre2','historyvisit_visit_detailpagenum']
生成酒店特征的相关性矩阵¶
In [ ]:
def corr_hotel(rawdf):
    """Correlate the hotel-side features present in ``rawdf`` and plot them.

    Mirrors ``corr_user`` but uses the module-level ``hotel_features`` list
    and an "Oranges" colour map.

    Parameters
    ----------
    rawdf : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Correlation matrix of the available hotel features.
    """
    # Keep only the hotel features actually present in this frame
    # (local no longer misnamed "user features").
    available = [col for col in hotel_features if col in rawdf.columns]
    print(available)
    mat = rawdf[available].corr()
    # Heatmap of the hotel-feature correlation matrix.
    fig, ax = plt.subplots(figsize=(18, 12))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False, linewidths=.5, annot=True, cmap="Oranges")
    plt.show()
    return mat
train_data_rawdf_mat = corr_hotel(train_data_rawdf)
train_data_rawdf_mat
# test_data_rawdf_mat = corr_hotel(test_data_rawdf)
# test_data_rawdf_mat
# > 0.86
# novoters和commentnums的相关性高达0.99,两个特征高度相关。因此取commentnums特征进入后续的预测与分析就好,或者选择PCA提取一个主成分表示酒店评论数
# cencelrate和commentnums三者的相关性也很高达到了0.86,可以看出酒店的评论数和取消率有很高的关系,可能是由于用户选择酒店后会查看酒店的相关评价,酒店的评论信息越多,用户对酒店也越了解,因此退订数量越少。因此要鼓励用户对酒店进行评价。
# uv_pre和uv_pre2的相关性达到了0.9,它们都表示24小时历史浏览次数最多的酒店的独立访客数信息,因此可以选择PCA提取一个主成分分析表示24小时历史浏览次数最多的酒店的独立访客数信息。
# commentnums_pre和novoters_pre的相关性高达0.99,两个特征高度相关。因此选择PCA提取一个主成分表示24小时历史浏览次数最多酒店点评数。
# commentnums_pre2和novoters_pre2的相关性高达0.99,两个特征高度相关。因此选择PCA提取一个主成分表示24小时历史浏览次数最多酒店点评数均值。
['hotelcr', 'hoteluv', 'commentnums', 'novoters', 'cancelrate', 'lowestprice', 'cr_pre', 'uv_pre', 'uv_pre2', 'businessrate_pre', 'businessrate_pre2', 'customereval_pre2', 'commentnums_pre', 'commentnums_pre2', 'cancelrate_pre', 'novoters_pre', 'novoters_pre2', 'deltaprice_pre2_t1', 'lowestprice_pre', 'lowestprice_pre2']
Out[ ]:
| hotelcr | hoteluv | commentnums | novoters | cancelrate | lowestprice | cr_pre | uv_pre | uv_pre2 | businessrate_pre | businessrate_pre2 | customereval_pre2 | commentnums_pre | commentnums_pre2 | cancelrate_pre | novoters_pre | novoters_pre2 | deltaprice_pre2_t1 | lowestprice_pre | lowestprice_pre2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| hotelcr | 1.000000 | -0.195894 | 0.077973 | 0.111510 | 0.145374 | -0.212134 | 0.455088 | -0.193813 | -0.221810 | 0.393174 | 0.462603 | -0.018209 | -0.042322 | -0.057016 | 0.045860 | -0.028255 | -0.037198 | 0.025691 | -0.149879 | -0.184937 |
| hoteluv | -0.195894 | 1.000000 | 0.682082 | 0.678037 | 0.756565 | 0.372250 | -0.201410 | 0.611926 | 0.693862 | -0.206681 | -0.208543 | 0.401654 | 0.400708 | 0.508152 | -0.255675 | 0.397479 | 0.497864 | -0.011403 | 0.310592 | 0.364769 |
| commentnums | 0.077973 | 0.682082 | 1.000000 | 0.986627 | 0.837859 | 0.224027 | -0.048611 | 0.405871 | 0.454397 | -0.017608 | -0.005398 | 0.366761 | 0.415256 | 0.509763 | -0.214569 | 0.416398 | 0.511192 | 0.028964 | 0.220525 | 0.254302 |
| novoters | 0.111510 | 0.678037 | 0.986627 | 1.000000 | 0.853580 | 0.219171 | -0.033836 | 0.403685 | 0.451455 | -0.004879 | 0.009604 | 0.369402 | 0.417601 | 0.512748 | -0.217408 | 0.422311 | 0.518597 | 0.033301 | 0.218488 | 0.251498 |
| cancelrate | 0.145374 | 0.756565 | 0.837859 | 0.853580 | 1.000000 | 0.246753 | -0.022484 | 0.428912 | 0.480678 | 0.030727 | 0.048090 | 0.388280 | 0.408483 | 0.511110 | -0.202260 | 0.413909 | 0.515620 | 0.042032 | 0.239832 | 0.276399 |
| lowestprice | -0.212134 | 0.372250 | 0.224027 | 0.219171 | 0.246753 | 1.000000 | -0.157166 | 0.293774 | 0.336927 | -0.081715 | -0.100605 | 0.418775 | 0.197605 | 0.255902 | -0.176545 | 0.194376 | 0.251468 | 0.044972 | 0.482142 | 0.565141 |
| cr_pre | 0.455088 | -0.201410 | -0.048611 | -0.033836 | -0.022484 | -0.157166 | 1.000000 | -0.224403 | -0.249726 | 0.521287 | 0.531262 | -0.005069 | 0.051888 | -0.017631 | -0.067697 | 0.084736 | 0.013853 | 0.030638 | -0.215729 | -0.230065 |
| uv_pre | -0.193813 | 0.611926 | 0.405871 | 0.403685 | 0.428912 | 0.293774 | -0.224403 | 1.000000 | 0.899233 | -0.287193 | -0.250915 | 0.475066 | 0.684716 | 0.673500 | -0.320180 | 0.681359 | 0.666136 | -0.011070 | 0.371156 | 0.403663 |
| uv_pre2 | -0.221810 | 0.693862 | 0.454397 | 0.451455 | 0.480678 | 0.336927 | -0.249726 | 0.899233 | 1.000000 | -0.288367 | -0.282605 | 0.533003 | 0.600708 | 0.739316 | -0.330733 | 0.596813 | 0.727811 | -0.016392 | 0.391096 | 0.456633 |
| businessrate_pre | 0.393174 | -0.206681 | -0.017608 | -0.004879 | 0.030727 | -0.081715 | 0.521287 | -0.287193 | -0.288367 | 1.000000 | 0.839675 | -0.018557 | -0.033132 | -0.043829 | 0.113483 | -0.017423 | -0.022617 | 0.083459 | -0.093702 | -0.116378 |
| businessrate_pre2 | 0.462603 | -0.208543 | -0.005398 | 0.009604 | 0.048090 | -0.100605 | 0.531262 | -0.250915 | -0.282605 | 0.839675 | 1.000000 | -0.019580 | -0.018595 | -0.025075 | 0.144381 | -0.002446 | -0.000377 | 0.090499 | -0.112667 | -0.142615 |
| customereval_pre2 | -0.018209 | 0.401654 | 0.366761 | 0.369402 | 0.388280 | 0.418775 | -0.005069 | 0.475066 | 0.533003 | -0.018557 | -0.019580 | 1.000000 | 0.434567 | 0.542253 | -0.442958 | 0.436519 | 0.545988 | 0.025923 | 0.516774 | 0.592722 |
| commentnums_pre | -0.042322 | 0.400708 | 0.415256 | 0.417601 | 0.408483 | 0.197605 | 0.051888 | 0.684716 | 0.600708 | -0.033132 | -0.018595 | 0.434567 | 1.000000 | 0.821527 | -0.310531 | 0.986821 | 0.822742 | 0.038209 | 0.231631 | 0.261593 |
| commentnums_pre2 | -0.057016 | 0.508152 | 0.509763 | 0.512748 | 0.511110 | 0.255902 | -0.017631 | 0.673500 | 0.739316 | -0.043829 | -0.025075 | 0.542253 | 0.821527 | 1.000000 | -0.329677 | 0.815104 | 0.981899 | 0.045098 | 0.287523 | 0.332534 |
| cancelrate_pre | 0.045860 | -0.255675 | -0.214569 | -0.217408 | -0.202260 | -0.176545 | -0.067697 | -0.320180 | -0.330733 | 0.113483 | 0.144381 | -0.442958 | -0.310531 | -0.329677 | 1.000000 | -0.322329 | -0.340087 | -0.009206 | -0.220220 | -0.237741 |
| novoters_pre | -0.028255 | 0.397479 | 0.416398 | 0.422311 | 0.413909 | 0.194376 | 0.084736 | 0.681359 | 0.596813 | -0.017423 | -0.002446 | 0.436519 | 0.986821 | 0.815104 | -0.322329 | 1.000000 | 0.833242 | 0.043769 | 0.226831 | 0.256747 |
| novoters_pre2 | -0.037198 | 0.497864 | 0.511192 | 0.518597 | 0.515620 | 0.251468 | 0.013853 | 0.666136 | 0.727811 | -0.022617 | -0.000377 | 0.545988 | 0.822742 | 0.981899 | -0.340087 | 0.833242 | 1.000000 | 0.055052 | 0.282702 | 0.325686 |
| deltaprice_pre2_t1 | 0.025691 | -0.011403 | 0.028964 | 0.033301 | 0.042032 | 0.044972 | 0.030638 | -0.011070 | -0.016392 | 0.083459 | 0.090499 | 0.025923 | 0.038209 | 0.045098 | -0.009206 | 0.043769 | 0.055052 | 1.000000 | 0.071691 | 0.080952 |
| lowestprice_pre | -0.149879 | 0.310592 | 0.220525 | 0.218488 | 0.239832 | 0.482142 | -0.215729 | 0.371156 | 0.391096 | -0.093702 | -0.112667 | 0.516774 | 0.231631 | 0.287523 | -0.220220 | 0.226831 | 0.282702 | 0.071691 | 1.000000 | 0.847433 |
| lowestprice_pre2 | -0.184937 | 0.364769 | 0.254302 | 0.251498 | 0.276399 | 0.565141 | -0.230065 | 0.403663 | 0.456633 | -0.116378 | -0.142615 | 0.592722 | 0.261593 | 0.332534 | -0.237741 | 0.256747 | 0.325686 | 0.080952 | 0.847433 | 1.000000 |
订单字段相关性分析¶
In [ ]:
# Order-side feature columns used for the order correlation analysis.
order_features = [
    'day_advanced', 'arrival_weekday', 'is_arrival_weekend',
    'ordercanceledprecent', 'ordercanncelednum', 'lasthtlordergap',
    'cityuvs', 'cityorders',
]
绘制订单特征的相关性矩阵热度图¶
In [ ]:
def order_hotel(rawdf):
    """Plot and return the correlation matrix of the order features.

    Parameters
    ----------
    rawdf : pandas.DataFrame
        Frame expected to contain (a subset of) ``order_features``;
        columns missing from ``rawdf`` are silently skipped, so the same
        function works on both the train and the test frame.

    Returns
    -------
    pandas.DataFrame
        Pearson correlation matrix of the available order features.
    """
    # Single pass: keep only the order features actually present.
    # (The original built a missing-column list first and then filtered
    # against it — equivalent but needlessly indirect.)
    available = [col for col in order_features if col in rawdf.columns]
    print(available)
    mat = rawdf[available].corr()
    # Heatmap of the order-feature correlation matrix.
    fig, ax = plt.subplots(figsize=(18, 12))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False,
                linewidths=.5, annot=True, cmap="Blues")
    plt.show()
    return mat
train_data_rawdf_mat = order_hotel(train_data_rawdf)
train_data_rawdf_mat
# test_data_rawdf_mat = order_hotel(test_data_rawdf)
# test_data_rawdf_mat
# 看出cityorders和cityuvs存在0.99的相关性,需要降维
['day_advanced', 'arrival_weekday', 'is_arrival_weekend', 'ordercanceledprecent', 'ordercanncelednum', 'lasthtlordergap', 'cityuvs', 'cityorders']
Out[ ]:
| day_advanced | arrival_weekday | is_arrival_weekend | ordercanceledprecent | ordercanncelednum | lasthtlordergap | cityuvs | cityorders | |
|---|---|---|---|---|---|---|---|---|
| day_advanced | 1.000000 | -0.003262 | -0.040864 | 0.002098 | -0.067534 | 0.077942 | -0.298827 | -0.297744 |
| arrival_weekday | -0.003262 | 1.000000 | 0.748189 | 0.005056 | -0.021776 | 0.010573 | 0.013699 | -0.006654 |
| is_arrival_weekend | -0.040864 | 0.748189 | 1.000000 | -0.001970 | -0.018639 | 0.003588 | -0.037376 | -0.050069 |
| ordercanceledprecent | 0.002098 | 0.005056 | -0.001970 | 1.000000 | 0.325672 | 0.015938 | -0.001178 | -0.000362 |
| ordercanncelednum | -0.067534 | -0.021776 | -0.018639 | 0.325672 | 1.000000 | -0.077737 | 0.023551 | 0.027659 |
| lasthtlordergap | 0.077942 | 0.010573 | 0.003588 | 0.015938 | -0.077737 | 1.000000 | -0.022230 | -0.024010 |
| cityuvs | -0.298827 | 0.013699 | -0.037376 | -0.001178 | 0.023551 | -0.022230 | 1.000000 | 0.987370 |
| cityorders | -0.297744 | -0.006654 | -0.050069 | -0.000362 | 0.027659 | -0.024010 | 0.987370 | 1.000000 |
分别筛选用户和酒店很相关的维度进行降维度¶
In [ ]:
# Dimensionality reduction: under given constraints, reduce the number of
# random variables (features) to obtain a set of "uncorrelated" principal
# variables.  Here PCA is applied to each group of features whose pairwise
# correlation exceeded ~0.8 in the correlation matrices above.
c_value = ['customer_value_profit', 'ctrip_profits']      # customer value
consume_level = ['avgprice', 'consuming_capacity']        # customer spending power
price_prefer = ['delta_price1', 'delta_price2']           # customer preferred price
ordernum_1_year = ['ordernum_oneyear', ]                  # customer orders in the past year
hotel_hot = ['commentnums', 'novoters']                   # hotel popularity
hotel_hot_pre = ['commentnums_pre', 'novoters_pre']       # popularity of the most-viewed hotel in 24h
hotel_hot_pre2 = ['commentnums_pre2', 'novoters_pre2']    # mean popularity of hotels viewed in 24h
hotel_uv_pre = ['uv_pre', 'uv_pre2']                      # UV of the most-viewed hotel in 24h
order_cityuvs_orders = ['cityorders', 'cityuvs']          # yesterday's same-city / same-check-in UV and order counts
In [ ]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
# Feature groups to collapse into a single principal component each,
# keyed by the name of the new column that replaces the group.
_pca_groups = {
    'c_value': c_value,
    'consume_level': consume_level,
    'price_prefer': price_prefer,
    'ordernum_1_year': ordernum_1_year,
    'hotel_hot': hotel_hot,
    'hotel_hot_pre': hotel_hot_pre,
    'hotel_hot_pre2': hotel_hot_pre2,
    'hotel_uv_pre': hotel_uv_pre,
    'order_cityuvs_orders': order_cityuvs_orders,
}
def pca_data(rawdf):
    """Replace each highly-correlated feature group in ``rawdf`` with its
    first principal component, in place.

    If the ``c_value`` source columns are already absent the frame is
    assumed to have been processed before and the function returns
    immediately, making repeated notebook-cell runs safe.
    """
    if not any(col in rawdf.columns for col in c_value):
        return
    print('PCA降维前维度是:{}'.format(rawdf.shape))  # e.g. (684128, 46)
    # One add/drop pair per group replaces the nine copy-pasted pairs of
    # the original; the iteration order matches the original column order.
    for new_col, group in _pca_groups.items():
        rawdf[new_col] = pca.fit_transform(rawdf[group])
        rawdf.drop(group, axis=1, inplace=True)
    print('PCA降维后维度是:{}'.format(rawdf.shape))  # e.g. (684128, 38)
pca_data(train_data_rawdf)
pca_data(test_data_rawdf)
PCA降维前维度是:(689945, 46) PCA降维后维度是:(689945, 38) PCA降维前维度是:(435075, 45) PCA降维后维度是:(435075, 37)
In [ ]:
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。 在train_data中独有的字段: Index(['label'], dtype='object') 在test_data中独有的字段: Index([], dtype='object')
In [ ]:
# Peek at the transformed frame and keep an untouched copy for later use.
train_data_rawdf.head(1)
train_data_rawdf_copy = train_data_rawdf.copy()
标准化处理¶
In [ ]:
from sklearn.preprocessing import StandardScaler
# Split the target column from the feature matrix.
y=train_data_rawdf['label']
x = train_data_rawdf.drop('label', axis=1)
# Standardize: fit_transform computes each column's mean and standard
# deviation and applies (x - mean) / std in a single call (idiomatic
# replacement for the separate fit + transform of the original).
scaler = StandardScaler()
X = scaler.fit_transform(x)
# X_test_x = test_data_rawdf
三、建模¶
导入模型建模¶
In [ ]:
from sklearn.model_selection import train_test_split, GridSearchCV
# 80/20 train/test split with a fixed seed for reproducibility.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.2,random_state=80471)
逻辑回归¶
In [ ]:
# Logistic-regression baseline: fit on the training split, then report
# accuracy, AUC, precision and recall on the held-out test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn import metrics
# clf = LogisticRegression()
# # parameter grid
# param_grid = {
# 'C': [0.001, 0.01, 0.1, 1, 10, 100],
# 'penalty': ['l1', 'l2'],
# 'solver': ['liblinear', 'saga']
# }
# # GridSearchCV or RandomizedSearchCV
# lr = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_prob = lr.predict_proba(X_test)[:, 1] # predicted probability of class 1
y_pred = lr.predict(X_test) # hard class predictions on the test split
# print(y_prob)
# print(y_pred)
fpr_lr, tpr_lr, threshold_lr = metrics.roc_curve(y_test, y_prob) # false-positive rate, true-positive rate, thresholds
print(f"fpr_lr: {fpr_lr}")
print(f"tpr_lr: {tpr_lr}")
auc_lr = metrics.auc(fpr_lr, tpr_lr)
score_lr = metrics.accuracy_score(y_test, y_pred)
# precision and recall on the test split
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
train_prob = lr.score(X_train, y_train)
test_prob = lr.score(X_test, y_test)
# NOTE(review): despite its label, the next line prints
# (1 - training accuracy) * 1000, i.e. a scaled training error rather
# than a score — confirm the intent.
print(f"Training set score: {(1 - train_prob) * 1000}")
print(f"train_prob: {train_prob}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print('模型准确率为:{0}, AUC得分为:{1}'.format(score_lr, auc_lr))
print('============================================================')
print(classification_report(y_test, y_pred))
fpr_lr: [0. 0. 0. ... 0.99918155 0.99918155 1. ]
tpr_lr: [0.00000000e+00 2.64550265e-05 7.93650794e-05 ... 9.99973545e-01
1.00000000e+00 1.00000000e+00]
Training set score: 262.23285914094595
train_prob: 0.737767140859054
Precision: 0.5792074592074592
Recall: 0.16433862433862434
模型准确率为:0.7383776967729312, AUC得分为:0.7013352216691405
============================================================
precision recall f1-score support
0.0 0.75 0.95 0.84 100189
1.0 0.58 0.16 0.26 37800
accuracy 0.74 137989
macro avg 0.67 0.56 0.55 137989
weighted avg 0.70 0.74 0.68 137989
决策树¶
In [ ]:
from sklearn import tree
# Decision-tree classifier tuned with a small grid search.
# NOTE: 'auto' was removed from the max_features options (deprecated in
# scikit-learn 1.1, removed in 1.3); for classifiers it was an alias of
# 'sqrt', so dropping it keeps the search equivalent while no longer
# crashing on current scikit-learn versions.
param_grid = {
'max_depth': [3, 5],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2, 4],
'max_features': ['sqrt', 'log2'],
'criterion': ['gini', 'entropy'],
'splitter': ['best', 'random']
}
# Base estimator for the search (fixed seed for reproducibility).
grid_search = tree.DecisionTreeClassifier(random_state=42)
# Exhaustive 5-fold cross-validated search, selecting on accuracy.
dtc = GridSearchCV(estimator=grid_search, param_grid=param_grid, cv=5, scoring='accuracy')
# dtc = tree.DecisionTreeClassifier() # untuned decision tree, for comparison
dtc.fit(X_train,y_train)
# Best model found by the grid search.
best_dtc = dtc.best_estimator_
y_prob = best_dtc.predict_proba(X_test)[:,1] # predicted probability of class 1
y_pred = best_dtc.predict(X_test) # hard class predictions on the test split
fpr_dtc,tpr_dtc,threshod_dtc= metrics.roc_curve(y_test,y_prob) # FPR, TPR, thresholds
score_dtc = metrics.accuracy_score(y_test,y_pred)
auc_dtc = metrics.auc(fpr_dtc,tpr_dtc)
print('模型准确率为:{0},AUC得分为:{1}'.format(score_dtc,auc_dtc))
print('============================================================')
print(classification_report(y_test,y_pred,labels=None,target_names=None,sample_weight=None, digits=2))
模型准确率为:0.7304785164034814,AUC得分为:0.632274001475835
============================================================
precision recall f1-score support
0.0 0.75 0.95 0.84 100189
1.0 0.53 0.14 0.22 37800
accuracy 0.73 137989
macro avg 0.64 0.55 0.53 137989
weighted avg 0.69 0.73 0.67 137989
xgboost¶
In [ ]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn import metrics
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Full grid explored in an earlier run, kept for reference:
# param_grid = {
# 'max_depth': [6, 8],
# 'eta': [0.1, 0.2],
# 'subsample': [0.6, 0.7],
# 'colsample_bytree': [0.7, 0.8],
# 'min_child_weight': [1, 3, 5],
# 'gamma': [0, 0.1, 0.2]
# }
# Single-point grid: the best combination found by that earlier search.
param_grid = {'colsample_bytree': [0.8], 'eta': [0.2], 'gamma': [0], 'max_depth': [8], 'min_child_weight': [1], 'subsample': [0.7]}
# XGBoost classifier.  'silent' was removed in XGBoost 1.0 and 'nthread'
# deprecated in favour of n_jobs, so the modern equivalents are used here
# (verbosity=0 reproduces silent=1's quiet output).
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', n_jobs=8, verbosity=0)
# 5-fold cross-validated grid search selecting on ROC AUC.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
# Report the winning parameters and keep the refit best model.
print("Best parameters:", grid_search.best_params_)
best_xgb_model = grid_search.best_estimator_
# Evaluate the best model on the held-out test split.
y_prob_best = best_xgb_model.predict_proba(X_test)[:, 1]  # probability of class 1
y_pred_best = (y_prob_best >= 0.5).astype(int)  # hard predictions at the 0.5 threshold
fpr_best_xgb, tpr_best_xgb, _ = metrics.roc_curve(y_test, y_prob_best)
auc_best_xgb = metrics.auc(fpr_best_xgb, tpr_best_xgb)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
accuracy_best_xgb = metrics.accuracy_score(y_test, y_pred_best)
print('模型准确率为:{0}, AUC得分为:{1}'.format(accuracy_best_xgb, auc_best_xgb))
print('============================================================')
print(classification_report(y_test, y_pred_best))
Best parameters: {'colsample_bytree': 0.8, 'eta': 0.2, 'gamma': 0, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.7}
模型准确率为:0.818789903543036, AUC得分为:0.8605628164356667
============================================================
precision recall f1-score support
0.0 0.82 0.96 0.88 100189
1.0 0.80 0.46 0.58 37800
accuracy 0.82 137989
macro avg 0.81 0.71 0.73 137989
weighted avg 0.82 0.82 0.80 137989
XGBoost¶
阈值调整¶
画对比结果¶
In [ ]:
# Compare the fitted models on a single ROC plot.
plt.style.use('bmh')
plt.figure(figsize=(10,10))
# NOTE(review): the legend numbers are the accuracy scores (score_lr,
# score_dtc, accuracy_best_xgb), not the AUC values — confirm this is
# the intended annotation for an ROC chart.
plt.plot(fpr_lr, tpr_lr, label='逻辑回归:%.3f' % score_lr ) # logistic regression
# plt.plot(fpr_gnb,tpr_gnb,label='gnb:{0:.3f}'.format(score_gnb)) # naive Bayes
# plt.plot(fpr_svc,tpr_svc,label='svc:{0:.3f}'.format(score_svc)) # support vector machine
plt.plot(fpr_dtc,tpr_dtc,label='决策树:{0:.3f}'.format(score_dtc)) # decision tree
# plt.plot(fpr_rfc,tpr_rfc,label='rfc:{0:.3f}'.format(score_rfc)) # random forest
plt.plot(fpr_best_xgb, tpr_best_xgb,label='XGBoost:{0:.3f}'.format(accuracy_best_xgb)) # XGBoost
plt.plot([0, 1], [0, 1], 'k--', label='')  # chance diagonal
plt.legend(loc='lower right', prop={'size':15})
plt.xlabel('伪阳率')
plt.ylabel('真阳率')
plt.title('ROC曲线')
# plt.savefig('./images/模型比较ROC曲线图.jpg',dpi=400, bbox_inches='tight')
plt.show()
四、画像¶
RFM¶
In [ ]:
# RFM model:
#   R (Recency):   time since the last purchase
#   F (Frequency): purchase frequency
#   M (Monetary):  purchase amount
# The dataset has no direct R/F/M columns, so we use
# lasthtlordergap (time since the last order) as R, and the PCA-derived
# ordernum_1_year (orders per year) and consume_level (spending power)
# as F and M, then segment the user base on them.
rfm_features = ['lasthtlordergap','ordernum_1_year','consume_level']
rfm = train_data_rawdf[rfm_features]
# Min-max scale to [0, 1] so the three indicators are comparable
# (fit_transform replaces the original separate fit + transform).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
rfm = pd.DataFrame(scaler.fit_transform(rfm), columns=['recency', 'frequency','monetary'] )
# Split each indicator at its median (2 quantile bins).
rfm['R'] = pd.qcut(rfm['recency'], 2)
rfm['F'] = pd.qcut(rfm['frequency'], 2)
rfm['M'] = pd.qcut(rfm['monetary'], 2)
# Encode the two bins as 0/1; plain label encoding is enough for a
# binary split (fit_transform replaces each fit(...).transform(...)).
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
rfm['R'] = encoder.fit_transform(rfm['R'])
rfm['F'] = encoder.fit_transform(rfm['F'])
rfm['M'] = encoder.fit_transform(rfm['M'])
# RFM segmentation rule.  Note that R measures time since the last
# purchase, so a SMALL R means a more valuable customer — the opposite
# direction of F and M.
_RFM_LABELS = {
    (0, 1, 1): '高价值客户',
    (1, 1, 1): '重点保持客户',
    (0, 0, 1): '重点发展客户',
    (1, 0, 1): '重点挽留客户',
    (0, 1, 0): '一般价值客户',
    (1, 1, 0): '一般保持客户',
    (0, 0, 0): '一般发展客户',
    (1, 0, 0): '潜在客户',
}
def get_label(r,f,m):
    """Map binary (R, F, M) codes to a customer-segment label.

    A single dict lookup replaces the original eight-way if chain.
    Returns None for any combination outside {0, 1}^3, matching the
    original fall-through behaviour.
    """
    return _RFM_LABELS.get((r, f, m))
def RFM_convert(df):
    """Attach the segment label and turn the 0/1 R/F/M codes into
    human-readable 高/低 flags; returns the four display columns."""
    # Label first, while R/F/M are still numeric codes.
    df['Label'] = df.apply(lambda row: get_label(row['R'], row['F'], row['M']), axis=1)
    # R is "high" when the code is 0 (recent); F and M are "high" at 1.
    flags = {'R': df['R'] == 0, 'F': df['F'] == 1, 'M': df['M'] == 1}
    for col, is_high in flags.items():
        df[col] = np.where(is_high, '高', '低')
    return df[['R', 'F', 'M', 'Label']]
# Apply the RFM conversion and visualise the segment distribution.
rfm0 = RFM_convert(rfm)
rfm0.head()
# Visualisation: pie chart of segment sizes.
# label_cnt = rfm0.groupby('Label').size()
label_cnt = rfm0['Label'].value_counts().values
labels = rfm0['Label'].value_counts().index
# Pull the three largest slices slightly out of the pie.  Assumes exactly
# 8 segments occur in the data — TODO confirm; a different count would
# make the explode length mismatch the slice count.
explode=[0.1,0.1,0.1,0,0,0,0,0]
plt.figure(figsize=(14,18))
# colors=['orangered','lightsalmon','sienna','seashell','chocolate','peru','sandybrown','peachpuff']
plt.pie(label_cnt, labels=labels,radius=1, explode=explode, autopct='%.1f%%',pctdistance=0.75,
wedgeprops={'linewidth':0.5,'edgecolor':'black'}, textprops={'fontsize':14,'color':'black'})
# plt.pie([1],radius=0.6,colors='w')
plt.title("RFM客户分群情况")
plt.legend(labels, fontsize=14, loc='best')
# plt.savefig('./images/客户分群情况.jpg',dpi=400, bbox_inches='tight')
plt.show()
K-means¶
选取出刻画用户的重要指标¶
In [ ]:
# # In [36]
# from xgboost import plot_importance
# # 解决f特征名字
# def ceate_feature_map(features):
# outfile = open('xgb.fmap', 'w')
# i = 0
# for feat in features:
# outfile.write('{0}\t{1}\tq\n'.format(i, feat))
# i = i + 1
# outfile.close()
# fig, ax = plt.subplots(figsize=(15,15))
# plot_importance(best_xgb_model, height=0.5, ax=ax, max_num_features=40, color='green')
# plt.savefig('./重要性特征图.jpg', dpi=400, bbox_inches='tight')
# ceate_feature_map(train_data_rawdf.columns)
# plt.show()
from xgboost import plot_importance
import matplotlib.pyplot as plt
# 修正函数名并解决特征名字映射问题
def create_feature_map(features):
    """Write an XGBoost feature-map file ('xgb.fmap').

    Each line has the form '<index>\\t<name>\\tq', the format expected by
    xgboost's fmap argument ('q' marks a quantitative feature).

    Parameters
    ----------
    features : iterable of str
        Feature names in model column order.
    """
    with open('xgb.fmap', 'w') as outfile:
        # enumerate replaces the original hand-maintained index counter.
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
# Plot the 40 most important features of the tuned XGBoost model.
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(best_xgb_model, height=0.5, ax=ax, max_num_features=40, color='green')
# Save the importance chart to disk.
plt.savefig('./重要性特征图.jpg', dpi=400, bbox_inches='tight')
# Write the feature-map file (index -> column name).
create_feature_map(train_data_rawdf.columns)
# Show the chart.
plt.show()
In [ ]:
import pandas as pd
# Feature importances from the tuned XGBoost model (one value per model input).
importances = best_xgb_model.feature_importances_
# Column names of the training frame (still includes the 'label' target).
feature_names = train_data_rawdf.columns.tolist()
len(importances)
# 37
len(feature_names)
# 38
new_feature_names = feature_names.copy()
# new_feature_names
# Drop 'label' so the name list lines up with the 37 importances
# (the model was trained on every column except the target).
new_feature_names = [feat for feat in new_feature_names if feat not in ['label']]
# , 'sampleid'
# NOTE(review): this pairing assumes the remaining column order matches
# the order the model saw during training — confirm, since a mismatch
# would attach importances to the wrong feature names.
print(new_feature_names)
len(new_feature_names)
len(importances)
# Table of (feature, importance) pairs.
feature_importances = pd.DataFrame({'feature': new_feature_names, 'importance': importances})
# Sort by importance, descending.
feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)
# Keep the 15 most important features.
top_15_features = feature_importances_sorted.head(15)
# Print the top-15 features with their importance scores.
print(top_15_features)
['sampleid', 'iforderpv_24h', 'hotelcr', 'ordercanceledprecent', 'landhalfhours', 'ordercanncelednum', 'starprefer', 'cancelrate', 'price_sensitive', 'hoteluv', 'businessrate_pre', 'cr_pre', 'lowestprice', 'customereval_pre2', 'cancelrate_pre', 'deltaprice_pre2_t1', 'lowestprice_pre', 'lowestprice_pre2', 'lasthtlordergap', 'businessrate_pre2', 'lastpvgap', 'cr', 'sid', 'visitnum_oneyear', 'h', 'day_advanced', 'arrival_weekday', 'is_arrival_weekend', 'c_value', 'consume_level', 'price_prefer', 'ordernum_1_year', 'hotel_hot', 'hotel_hot_pre', 'hotel_hot_pre2', 'hotel_uv_pre', 'order_cityuvs_orders']
feature importance
1 iforderpv_24h 0.114401
25 day_advanced 0.078948
21 cr 0.074080
24 h 0.057563
2 hotelcr 0.042464
19 businessrate_pre2 0.036986
31 ordernum_1_year 0.033841
18 lasthtlordergap 0.029704
4 landhalfhours 0.029696
23 visitnum_oneyear 0.027455
22 sid 0.023670
28 c_value 0.023612
11 cr_pre 0.023009
5 ordercanncelednum 0.022008
20 lastpvgap 0.021730